diff --git a/evaluations/.gitignore b/evaluations/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/evaluations/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/evaluations/README.adoc b/evaluations/README.adoc
new file mode 100644
index 0000000..8547c49
--- /dev/null
+++ b/evaluations/README.adoc
@@ -0,0 +1,158 @@
+= Semantic Anchor Evaluations
+:toc:
+
+== Overview
+
+Multiple-choice evaluation framework for testing whether semantic anchors work across different LLMs.
+See the link:../docs/anchor-evaluations.adoc[full concept document] for background and methodology.
+
+== Quick Start
+
+=== Prerequisites
+
+* Python 3.10+
+* `pyyaml` package: `pip install pyyaml`
+* At least one of:
+** Claude Code CLI (authenticated)
+** OpenAI API key (`OPENAI_API_KEY` environment variable)
+** Ollama running locally
+
+=== Running the Pilot
+
+[source,bash]
+----
+cd website
+
+# Claude Sonnet (default, via CLI)
+python3 evaluations/pilot.py
+
+# Claude Haiku
+python3 evaluations/pilot.py --model claude-haiku
+
+# GPT-4o-mini (requires OPENAI_API_KEY)
+python3 evaluations/pilot.py --model openai
+
+# Ollama (requires local server + model)
+ollama serve & # start server if not running
+ollama pull qwen3:4b # pull model (once)
+python3 evaluations/pilot.py --model ollama # uses qwen3:4b by default
+python3 evaluations/pilot.py --model ollama --ollama-model mistral # other model
+
+# Multiple models at once
+python3 evaluations/pilot.py --model claude-cli claude-haiku openai
+
+# Dry run (show prompts without sending)
+python3 evaluations/pilot.py --dry-run
+----
+
+=== Available Models
+
+[cols="1,1,2"]
+|===
+|Flag |Model |Notes
+
+|`claude-cli`
+|Claude Sonnet (via CLI)
+|Default. Requires `claude` CLI authenticated.
+
+|`claude-haiku`
+|Claude Haiku (via CLI)
+|Smallest Claude model. Good lower-bound test.
+
+|`openai`
+|GPT-4o-mini (via API)
+|Requires `OPENAI_API_KEY`.
+
+|`claude`
+|Claude Sonnet (via API)
+|Requires `ANTHROPIC_API_KEY`. Alternative to CLI.
+
+|`ollama`
+|Local model (via Ollama)
+|Requires Ollama server on `localhost:11434`. Default: `qwen3:4b`, override with `--ollama-model`.
+|===
+
+== Directory Structure
+
+[source]
+----
+evaluations/
+├── README.adoc # This file
+├── pilot.py # Evaluation runner script
+├── specs/ # Question specs (YAML)
+│ ├── arc42.yaml
+│ ├── docs-as-code.yaml
+│ ├── mece.yaml
+│ ├── tdd-london-school.yaml
+│ └── timtowtdi.yaml
+└── results/ # Raw results (JSON, timestamped)
+ └── pilot-*.json
+----
+
+== Question Spec Format
+
+Each anchor has a YAML file with multiple-choice questions:
+
+[source,yaml]
+----
+anchor: tdd-london-school
+tier: 3
+
+questions:
+ recognition: # Level 1: Does the model identify the anchor?
+ question: |
+ Which of the following best describes "TDD, London School"?
+ options:
+ A: ... # Distractor (e.g., Chicago School description)
+ B: ... # Correct answer
+ C: ... # Distractor (e.g., BDD description)
+ D: ... # Distractor
+ correct: B
+
+ application: # Level 2: Does it change behavior?
+ scenario: |
+ You are reviewing a PR. ...
+ anchor_prompt: "using TDD, London School principles"
+ paraphrase_prompt: "Write isolated tests for the service layer"
+ options: ...
+ correct: B
+
+ consistency: # Level 4: Same answer across aliases/languages?
+ variants:
+ - 'Question with canonical name'
+ - 'Question with alias'
+ language_variant: 'Frage auf Deutsch'
+ options: ...
+ correct: B
+----
+
+== Scoring
+
+* Each question runs *4 times*, once for each of four rotated option orders (position bias mitigation)
+* Score = percentage of correct answers across the 4 runs
+* Response parsing: extracts the first capital letter A–D from the response
+* Results saved as timestamped JSON in `results/`
+
+== Pilot Results (2026-03-24)
+
+[cols="1,1,1,1"]
+|===
+|Model |Average |Best |Worst
+
+|Claude Sonnet 4.6
+|100%
+|all 100%
+|—
+
+|Claude Haiku 4.5
+|100%
+|all 100%
+|—
+
+|GPT-4o-mini
+|81%
+|Recognition: arc42, MECE, TIMTOWTDI (100%)
+|TDD London School Recognition (25%)
+|===
+
+Key finding: *Position bias is real.* GPT-4o-mini recognizes "TDD, London School" only 25% of the time -- it picks the correct answer only when it happens to be in a favorable position.
diff --git a/evaluations/fill-distractors.py b/evaluations/fill-distractors.py
new file mode 100644
index 0000000..eeb002d
--- /dev/null
+++ b/evaluations/fill-distractors.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""
+Fill placeholder distractors in evaluation specs using Claude API.
+
+Reads specs with PLACEHOLDER_A/C/D options and asks Claude to generate
+plausible but wrong distractors based on the anchor's domain.
+
+Usage:
+ python3 evaluations/fill-distractors.py # Fill all placeholders
+ python3 evaluations/fill-distractors.py --dry-run # List affected specs, no API calls
+ python3 evaluations/fill-distractors.py --anchor arc42 # Single anchor
+"""
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+try:
+ import yaml
+except ImportError:
+ print("PyYAML required: pip install pyyaml")
+ sys.exit(1)
+
+SPECS_DIR = Path(__file__).parent / "specs"
+
+
+def needs_distractors(spec):
+    """Return True when any recognition option still contains "PLACEHOLDER".
+
+    A spec without a ``questions`` / ``recognition`` / ``options`` entry is
+    treated as having no options, so the result is False.
+    """
+    recognition = spec.get("questions", {}).get("recognition", {})
+    for option_text in recognition.get("options", {}).values():
+        # Option values are coerced to str so non-string YAML scalars are safe.
+        if "PLACEHOLDER" in str(option_text):
+            return True
+    return False
+
+
+def _parse_distractor_json(text):
+    """Extract and validate the distractor mapping from a raw model reply.
+
+    The reply may be bare JSON, JSON inside a Markdown code fence, or JSON
+    surrounded by stray prose, so the span from the first "{" to the last "}"
+    is parsed instead of relying on a fence split.
+
+    Returns the parsed dict. Raises ValueError when no JSON object is present,
+    when the JSON is malformed, or when any of the keys "A", "C", "D" is
+    missing or is not a non-empty string.
+    """
+    start = text.find("{")
+    end = text.rfind("}")
+    if start == -1 or end < start:
+        raise ValueError(f"no JSON object in model response: {text[:80]!r}")
+    # json.JSONDecodeError subclasses ValueError, so callers see one error type.
+    data = json.loads(text[start:end + 1])
+    unusable = [key for key in ("A", "C", "D")
+                if not isinstance(data.get(key), str) or not data[key].strip()]
+    if unusable:
+        raise ValueError(f"model response lacks usable distractor(s): {', '.join(unusable)}")
+    return data
+
+
+def generate_distractors(spec):
+    """Ask the Claude API for three wrong-but-plausible recognition options.
+
+    Reads the recognition question from ``spec`` (option "B" is taken as the
+    correct answer) together with the optional ``_related`` / ``_proponents``
+    hints, sends a single prompt, and returns a dict that contains at least
+    the keys "A", "C" and "D".
+
+    Exits the process when the ``anthropic`` package is not installed. Raises
+    KeyError when the spec has no recognition question or no option "B", and
+    ValueError when the reply cannot be turned into three distractors.
+    """
+    try:
+        import anthropic
+    except ImportError:
+        print("anthropic package required: pip install anthropic")
+        sys.exit(1)
+
+    q = spec["questions"]["recognition"]
+    correct = q["options"]["B"]
+    # The display title is the first double-quoted phrase of the question text;
+    # fall back to the anchor id when the question contains no quotes.
+    title = q["question"].strip().split('"')[1] if '"' in q["question"] else spec["anchor"]
+    related = q.get("_related", [])
+    proponents = q.get("_proponents", "")
+
+    prompt = f"""Generate 3 plausible but WRONG multiple-choice distractors for this question:
+
+Question: Which of the following best describes "{title}"?
+Correct answer: {correct}
+
+Requirements for distractors:
+- Each distractor should be a one-sentence description of a DIFFERENT but related concept
+- They must be wrong but sound plausible to someone unfamiliar with the topic
+- All 4 options (correct + 3 distractors) should be similar in length
+- Do NOT include the correct concept in any distractor
+- Draw distractors from adjacent concepts in software engineering, architecture, or methodology
+{f"- Related anchors for inspiration: {', '.join(related)}" if related else ""}
+{f"- The correct answer is associated with: {proponents}" if proponents else ""}
+
+Return ONLY a JSON object with keys "A", "C", "D" containing the 3 distractor strings. No explanation."""
+
+    client = anthropic.Anthropic()
+    response = client.messages.create(
+        model="claude-sonnet-4-20250514",
+        max_tokens=300,
+        temperature=0.7,  # some creativity for diverse distractors
+        messages=[{"role": "user", "content": prompt}],
+    )
+
+    return _parse_distractor_json(response.content[0].text.strip())
+
+
+def main():
+    """Fill placeholder distractors in every matching spec file.
+
+    Scans SPECS_DIR for ``*.yaml`` specs (optionally restricted by ``--anchor``),
+    keeps those whose recognition options still contain placeholders, and, unless
+    ``--dry-run`` is given, replaces options A, C and D with generated text,
+    drops the ``_``-prefixed helper notes and rewrites the file in place.
+    A failure on one spec is reported and does not stop the remaining ones.
+    """
+    parser = argparse.ArgumentParser(description="Fill placeholder distractors using Claude API")
+    parser.add_argument("--dry-run", action="store_true", help="Preview without writing")
+    parser.add_argument("--anchor", help="Process single anchor")
+    args = parser.parse_args()
+
+    specs_to_fill = []
+    for f in sorted(SPECS_DIR.glob("*.yaml")):
+        spec = yaml.safe_load(f.read_text(encoding="utf-8"))
+        # An empty file loads as None and a stray list/scalar has no "anchor";
+        # skip those instead of aborting the whole run with TypeError/KeyError.
+        if not isinstance(spec, dict) or "anchor" not in spec:
+            print(f" SKIP {f.name}: not a spec mapping with an 'anchor' key")
+            continue
+        if args.anchor and spec["anchor"] != args.anchor:
+            continue
+        if needs_distractors(spec):
+            specs_to_fill.append((f, spec))
+
+    print(f"Found {len(specs_to_fill)} specs needing distractors")
+
+    for filepath, spec in specs_to_fill:
+        anchor_id = spec["anchor"]
+        print(f" {anchor_id}...", end=" ", flush=True)
+
+        if args.dry_run:
+            print("(dry run)")
+            continue
+
+        try:
+            distractors = generate_distractors(spec)
+            q = spec["questions"]["recognition"]
+            q["options"]["A"] = distractors["A"]
+            q["options"]["C"] = distractors["C"]
+            q["options"]["D"] = distractors["D"]
+
+            # Remove helper notes
+            q.pop("_note", None)
+            q.pop("_related", None)
+            q.pop("_proponents", None)
+            q.pop("_also_known_as", None)
+
+            # Serialise first: opening with "w" truncates the file, so a dump
+            # error after the open would otherwise leave an empty spec behind.
+            rendered = yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+            with open(filepath, "w", encoding="utf-8") as fh:
+                fh.write(rendered)
+            print("OK")
+
+        except Exception as e:
+            # Top-level boundary: report and carry on with the next spec.
+            print(f"ERROR: {e}")
+
+    print("\nDone. Review the generated distractors before running evaluations!")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/evaluations/generate-l1-specs.py b/evaluations/generate-l1-specs.py
new file mode 100644
index 0000000..198746d
--- /dev/null
+++ b/evaluations/generate-l1-specs.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+Generate Level 1 (Recognition) evaluation specs from .adoc anchor metadata.
+
+Reads each anchor's Core Concepts and Related Anchors to produce:
+- A correct answer from the anchor's core description
+- 3 placeholder distractors (PLACEHOLDER_A/C/D) to be replaced with plausible wrong answers
+
+Output: YAML specs in evaluations/specs/ (only recognition section).
+Existing specs are preserved — only missing anchors are generated.
+
+Usage:
+ python3 evaluations/generate-l1-specs.py # Generate all Tier 3
+ python3 evaluations/generate-l1-specs.py --dry-run # Preview without writing
+ python3 evaluations/generate-l1-specs.py --anchor arc42 # Single anchor
+"""
+
+import argparse
+import os
+import re
+import sys
+from pathlib import Path
+
+try:
+ import yaml
+except ImportError:
+ print("PyYAML required: pip install pyyaml")
+ sys.exit(1)
+
+ANCHORS_DIR = Path(__file__).parent.parent / "docs" / "anchors"
+SPECS_DIR = Path(__file__).parent / "specs"
+
+# Skip these anchors (templates, meta, sub-patterns handled by umbrella)
+SKIP_PREFIXES = ["_template", "gof-", "solid-", "test-double-"]
+SKIP_EXACT = ["what-qualifies-as-a-semantic-anchor", "gof-design-patterns",
+ "solid-principles", "test-double-meszaros"]
+
+
+def parse_adoc(filepath):
+    """Extract title, header attributes and list sections from one .adoc anchor file.
+
+    Args:
+        filepath: ``pathlib.Path`` of the anchor document (read as UTF-8).
+
+    Returns:
+        A dict with the keys ``id`` (file stem), ``title``, ``tier`` (int or
+        None), ``categories``, ``related`` (list of ids), ``proponents``,
+        ``also_known_as``, ``core_concepts`` (list of ``{"term", "desc"}``)
+        and ``when_to_use`` (list of bullet texts).
+    """
+    content = filepath.read_text(encoding="utf-8")
+    lines = content.split("\n")
+
+    result = {
+        "id": filepath.stem,
+        "title": "",
+        "tier": None,
+        "categories": "",
+        "related": [],
+        "proponents": "",
+        "also_known_as": "",
+        "core_concepts": [],
+        "when_to_use": [],
+    }
+
+    # Header: document title and ":name: value" attribute lines.
+    for line in lines:
+        if line.startswith("= "):
+            result["title"] = line[2:].strip()
+        elif line.startswith(":tier:"):
+            try:
+                result["tier"] = int(line[len(":tier:"):].strip())
+            except ValueError:
+                # A malformed tier must not abort the run over all anchors;
+                # the anchor simply keeps tier=None.
+                pass
+        elif line.startswith(":categories:"):
+            result["categories"] = line[len(":categories:"):].strip()
+        elif line.startswith(":related:"):
+            # Drop empty entries so an empty attribute gives [] rather than [""].
+            result["related"] = [
+                r.strip() for r in line[len(":related:"):].split(",") if r.strip()
+            ]
+        elif line.startswith(":proponents:"):
+            result["proponents"] = line[len(":proponents:"):].strip()
+
+    # Sections: definition-list items under "Core Concepts", bullets under
+    # "When to Use".
+    # NOTE(review): section markers are matched as substrings anywhere in a
+    # line, so body text containing e.g. "Related" also closes the current
+    # section. Left as-is because the anchors' heading syntax is not visible here.
+    section = None
+    for line in lines:
+        if "Core Concepts" in line:
+            section = "core"
+            continue
+        if "When to Use" in line:
+            section = "when"
+            continue
+        if "Related" in line or "Contrast" in line or "Technical" in line:
+            section = None
+            continue
+
+        if section == "core" and "::" in line:
+            term, _, desc = line.partition("::")
+            term = term.strip()
+            # Block attributes and metadata entries are not concepts.
+            if term and not term.startswith(("[", "Key Proponent", "Also known as")):
+                result["core_concepts"].append({"term": term, "desc": desc.strip()})
+        elif section == "when" and line.strip().startswith("*"):
+            # Remove only the list marker; lstrip("* ") would also eat the
+            # opening "*" of bold text that directly follows the bullet.
+            result["when_to_use"].append(re.sub(r"^\*+(\s+|$)", "", line.strip()))
+
+    # Also known as (last occurrence wins)
+    for line in lines:
+        if "Also known as::" in line:
+            result["also_known_as"] = line.split("Also known as::")[1].strip()
+
+    return result
+
+
+def _decapitalize(text):
+    """Lower-case the first letter of ``text`` when its first word is a plain capitalised word."""
+    first_word = text.split(" ", 1)[0]
+    # "Tests" or "A" may be lower-cased mid-sentence; acronyms and mixed-case
+    # names such as "MECE" or "C4" must keep their spelling.
+    if first_word[:1].isupper() and (len(first_word) == 1 or first_word[1:].islower()):
+        return text[0].lower() + text[1:]
+    return text
+
+
+def build_correct_answer(anchor):
+    """Build a one-sentence correct answer from the anchor's first core concepts.
+
+    Each concept contributes its description (trailing periods removed) or,
+    when the description is empty, its term. Two or more concepts are joined
+    as "<first>; <second>"; only the second part's leading word is
+    lower-cased, so acronyms and names inside it survive.
+
+    Returns None when the anchor has no core concepts.
+    """
+    # Only the first two concepts ever reach the answer.
+    concepts = anchor["core_concepts"][:2]
+    if not concepts:
+        return None
+
+    parts = [c["desc"].rstrip(".") if c["desc"] else c["term"] for c in concepts]
+
+    if len(parts) >= 2:
+        return f"{parts[0]}; {_decapitalize(parts[1])}"
+    return parts[0]
+
+
+def generate_spec(anchor, all_anchors):
+    """Build the recognition-only spec dict for one anchor.
+
+    Option "B" carries the answer derived from the anchor's core concepts;
+    A, C and D are placeholders, and the ``_``-prefixed entries are helper
+    notes for whoever fills them in. ``all_anchors`` is accepted but not used.
+
+    Returns None when no correct answer can be built.
+    """
+    answer = build_correct_answer(anchor)
+    if not answer:
+        return None
+
+    # Key order is kept as written because the spec is dumped with sort_keys=False.
+    recognition = {
+        "question": f'Which of the following best describes "{anchor["title"]}"?\n',
+        "options": {
+            "A": "PLACEHOLDER_A",
+            "B": answer,
+            "C": "PLACEHOLDER_C",
+            "D": "PLACEHOLDER_D",
+        },
+        "correct": "B",
+        "_note": "REVIEW NEEDED: Distractors are placeholders. Replace A, C, D with plausible wrong answers from related anchors.",
+        "_related": anchor["related"],
+        "_proponents": anchor["proponents"],
+        "_also_known_as": anchor["also_known_as"],
+    }
+    return {
+        "anchor": anchor["id"],
+        "tier": anchor["tier"],
+        "questions": {"recognition": recognition},
+    }
+
+
+def should_skip(anchor_id):
+    """Return True when the anchor id is listed in SKIP_EXACT or starts with a SKIP_PREFIXES entry."""
+    if anchor_id in SKIP_EXACT:
+        return True
+    # str.startswith accepts a tuple, testing every prefix in one call.
+    return anchor_id.startswith(tuple(SKIP_PREFIXES))
+
+
+def main():
+ """Command-line entry point: write a recognition spec for each eligible anchor.
+
+ Parses every ``*.adoc`` file in ANCHORS_DIR, keeps the tier-3 anchors that
+ ``should_skip`` does not exclude, and writes ``<id>.yaml`` into SPECS_DIR for
+ each one that has no spec yet (or for all of them with ``--force``).
+ """
+ parser = argparse.ArgumentParser(description="Generate L1 evaluation specs from .adoc metadata")
+ parser.add_argument("--dry-run", action="store_true", help="Preview without writing files")
+ parser.add_argument("--anchor", help="Generate for a single anchor ID")
+ parser.add_argument("--force", action="store_true", help="Overwrite existing specs")
+ args = parser.parse_args()
+
+ # Parse all anchors, keyed by id; files whose stem ends in ".de" and the
+ # "_template" file are left out.
+ all_anchors = {}
+ for f in sorted(ANCHORS_DIR.glob("*.adoc")):
+ if f.stem.endswith(".de") or f.stem == "_template":
+ continue
+ anchor = parse_adoc(f)
+ all_anchors[anchor["id"]] = anchor
+
+ # Filter to Tier 3, skip sub-patterns; --anchor narrows this to one id.
+ targets = []
+ for aid, anchor in all_anchors.items():
+ if args.anchor and aid != args.anchor:
+ continue
+ if anchor["tier"] != 3:
+ continue
+ if should_skip(aid):
+ continue
+ targets.append(anchor)
+
+ print(f"Found {len(targets)} Tier 3 anchors to process")
+
+ generated = 0
+ skipped = 0
+ for anchor in targets:
+ spec_file = SPECS_DIR / f"{anchor['id']}.yaml"
+
+ # Existing specs are preserved unless --force is given.
+ if spec_file.exists() and not args.force:
+ skipped += 1
+ continue
+
+ spec = generate_spec(anchor, all_anchors)
+ if not spec:
+ # Counted neither as generated nor as skipped in the final summary.
+ print(f" SKIP {anchor['id']}: no core concepts found")
+ continue
+
+ if args.dry_run:
+ print(f"\n--- {anchor['id']} ---")
+ # The preview omits sort_keys=False, so its key order differs from the written file.
+ print(yaml.dump(spec, default_flow_style=False, allow_unicode=True))
+ else:
+ SPECS_DIR.mkdir(parents=True, exist_ok=True)
+ with open(spec_file, "w", encoding="utf-8") as fh:
+ yaml.dump(spec, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
+ print(f" WROTE {spec_file.name}")
+ # NOTE(review): indentation is not recoverable from this view; confirm whether
+ # dry-run previews are meant to be counted as generated.
+ generated += 1
+
+ print(f"\nDone: {generated} generated, {skipped} skipped (already exist)")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/evaluations/generate-l2-specs.py b/evaluations/generate-l2-specs.py
new file mode 100644
index 0000000..554b119
--- /dev/null
+++ b/evaluations/generate-l2-specs.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Generate Level 2 (Application) questions for evaluation specs using Claude API.
+
+For each anchor that has a recognition question but no application question,
+generates a realistic scenario with anchor prompt, paraphrase, and MC options.
+
+Usage:
+ python3 evaluations/generate-l2-specs.py # Fill all missing L2
+ python3 evaluations/generate-l2-specs.py --dry-run # Preview
+ python3 evaluations/generate-l2-specs.py --anchor arc42 # Single anchor
+"""
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+try:
+ import yaml
+except ImportError:
+ print("PyYAML required: pip install pyyaml")
+ sys.exit(1)
+
+SPECS_DIR = Path(__file__).parent / "specs"
+ANCHORS_DIR = Path(__file__).parent.parent / "docs" / "anchors"
+
+SKIP_ANCHORS = {"sanity-check", "negative-control"}
+
+
+def load_anchor_context(anchor_id):
+    """Return the first 2000 characters of the anchor's .adoc file, or "" when the file is absent."""
+    source = ANCHORS_DIR / f"{anchor_id}.adoc"
+    if not source.exists():
+        return ""
+    full_text = source.read_text(encoding="utf-8")
+    return full_text[:2000]
+
+
+def needs_application(spec):
+    """Return True when the spec's questions contain no "application" entry."""
+    questions = spec.get("questions", {})
+    has_application = "application" in questions
+    return not has_application
+
+
+def _parse_application_json(text):
+    """Extract and validate an application question from a raw model reply.
+
+    The span from the first "{" to the last "}" is parsed, which copes with
+    bare JSON, fenced JSON and JSON surrounded by prose alike.
+
+    Returns the parsed dict. Raises ValueError when no JSON object is found,
+    the JSON is malformed, a required key is missing, the options do not
+    provide A-D, or ``correct`` does not name one of the options.
+    """
+    start = text.find("{")
+    end = text.rfind("}")
+    if start == -1 or end < start:
+        raise ValueError(f"no JSON object in model response: {text[:80]!r}")
+    # json.JSONDecodeError subclasses ValueError, so callers see one error type.
+    data = json.loads(text[start:end + 1])
+
+    required = ("scenario", "anchor_prompt", "paraphrase_prompt", "options", "correct")
+    missing = [key for key in required if key not in data]
+    if missing:
+        raise ValueError(f"model response is missing: {', '.join(missing)}")
+    options = data["options"]
+    if not isinstance(options, dict) or any(letter not in options for letter in "ABCD"):
+        raise ValueError("model response must provide options A, B, C and D")
+    if data["correct"] not in options:
+        raise ValueError(f"'correct' names an unknown option: {data['correct']!r}")
+    return data
+
+
+def generate_application(spec):
+    """Ask the Claude API for a Level 2 (Application) question for one spec.
+
+    The anchor title is taken from the recognition question (first
+    double-quoted phrase, falling back to the anchor id) and the prompt is
+    enriched with the start of the anchor's .adoc file.
+
+    Returns a dict with ``scenario``, ``anchor_prompt``, ``paraphrase_prompt``,
+    ``options`` (A-D) and ``correct``.
+
+    Exits the process when the ``anthropic`` package is not installed. Raises
+    KeyError when the spec has no recognition question and ValueError when
+    the reply is not a structurally complete question.
+    """
+    try:
+        import anthropic
+    except ImportError:
+        print("anthropic package required: pip install anthropic")
+        sys.exit(1)
+
+    anchor_id = spec["anchor"]
+    question_text = spec["questions"]["recognition"]["question"]
+    title = question_text.split('"')[1] if '"' in question_text else anchor_id
+    context = load_anchor_context(anchor_id)
+
+    prompt = f"""Generate a Level 2 Application multiple-choice question for the semantic anchor "{title}".
+
+The question tests whether an LLM can APPLY the methodology, not just describe it.
+
+Anchor definition (from .adoc file):
+{context}
+
+Requirements:
+1. Write a realistic SCENARIO (2-3 sentences) describing a concrete software engineering situation where this anchor applies.
+2. Write an ANCHOR_PROMPT — a short phrase like "using {title}" that would be added to the scenario.
+3. Write a PARAPHRASE_PROMPT — describes the GOAL without naming the methodology or hinting at the correct answer. Must be fair: not too specific (leaks answer) and not too vague.
+4. Write 4 OPTIONS (A, B, C, D) — one correct answer that reflects the methodology, three plausible alternatives.
+5. All options should be similar in length.
+6. The correct answer should reflect what a practitioner of this methodology would recommend.
+
+Return ONLY a JSON object with this exact structure:
+{{
+ "scenario": "...",
+ "anchor_prompt": "using {title}",
+ "paraphrase_prompt": "...",
+ "options": {{
+ "A": "...",
+ "B": "...",
+ "C": "...",
+ "D": "..."
+ }},
+ "correct": "B"
+}}
+
+Make B the correct answer. No explanation outside the JSON."""
+
+    client = anthropic.Anthropic()
+    response = client.messages.create(
+        model="claude-sonnet-4-20250514",
+        max_tokens=500,
+        temperature=0.7,
+        messages=[{"role": "user", "content": prompt}],
+    )
+
+    return _parse_application_json(response.content[0].text.strip())
+
+
+def main():
+    """Add a generated application question to every spec that lacks one.
+
+    Scans SPECS_DIR for ``*.yaml`` specs, ignores the control anchors in
+    SKIP_ANCHORS, honours ``--anchor``, and, unless ``--dry-run`` is given,
+    stores the generated question under ``questions.application`` and rewrites
+    the file in place. A failure on one spec is reported and does not stop
+    the remaining ones.
+    """
+    parser = argparse.ArgumentParser(description="Generate L2 Application questions using Claude API")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--anchor", help="Process single anchor")
+    args = parser.parse_args()
+
+    specs_to_fill = []
+    for f in sorted(SPECS_DIR.glob("*.yaml")):
+        spec = yaml.safe_load(f.read_text(encoding="utf-8"))
+        # An empty file loads as None and a stray list/scalar has no "anchor";
+        # skip those instead of aborting the whole run with TypeError/KeyError.
+        if not isinstance(spec, dict) or "anchor" not in spec:
+            print(f" SKIP {f.name}: not a spec mapping with an 'anchor' key")
+            continue
+        if spec["anchor"] in SKIP_ANCHORS:
+            continue
+        if args.anchor and spec["anchor"] != args.anchor:
+            continue
+        if needs_application(spec):
+            specs_to_fill.append((f, spec))
+
+    print(f"Found {len(specs_to_fill)} specs needing L2 Application questions")
+
+    for filepath, spec in specs_to_fill:
+        anchor_id = spec["anchor"]
+        print(f" {anchor_id}...", end=" ", flush=True)
+
+        if args.dry_run:
+            print("(dry run)")
+            continue
+
+        try:
+            app = generate_application(spec)
+            spec["questions"]["application"] = app
+
+            # Serialise first: opening with "w" truncates the file, so a dump
+            # error after the open would otherwise leave an empty spec behind.
+            rendered = yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+            with open(filepath, "w", encoding="utf-8") as fh:
+                fh.write(rendered)
+            print("OK")
+
+        except Exception as e:
+            # Top-level boundary: report and carry on with the next spec.
+            print(f"ERROR: {e}")
+
+    print("\nDone. Review the generated scenarios before running evaluations!")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/evaluations/generate-report.py b/evaluations/generate-report.py
new file mode 100644
index 0000000..a6be43e
--- /dev/null
+++ b/evaluations/generate-report.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""
+Generate an HTML report from evaluation results.
+
+Reads all result JSON files and produces an interactive HTML report with:
+- Summary table (model × average score)
+- Heatmap (anchor × model)
+- Detail sections per anchor with raw responses
+- Controls (sanity check, negative control) shown separately
+
+Usage:
+ python3 evaluations/generate-report.py
+ python3 evaluations/generate-report.py --output evaluations/report.html
+"""
+
+import argparse
+from html import escape as h
+import json
+from collections import defaultdict
+from pathlib import Path
+
+RESULTS_DIR = Path(__file__).parent / "results"
+SPECS_DIR = Path(__file__).parent / "specs"
+
+# Models to include and display order
+MODEL_DISPLAY = {
+ "claude": "Claude Sonnet",
+ "claude-cli": "Claude Sonnet (CLI)",
+ "claude-haiku": "Claude Haiku",
+ "openai": "GPT-4o",
+ "mistral": "Mistral Large",
+ "ollama": "Ollama (local)",
+}
+
+CONTROL_ANCHORS = {"sanity-check", "negative-control"}
+
+
+def load_best_results():
+    """Pick, for every model, the result run that covers the most questions.
+
+    Reads each ``pilot-*.json`` in RESULTS_DIR in sorted filename order. A file
+    replaces the entry already held for a model when it has at least as many
+    question records, so on a tie the later file wins.
+
+    Returns:
+        ``{model: {"data", "file", "config", "duration", "timestamp"}}`` where
+        ``data`` is the model's list of question records and the other fields
+        describe the file it came from.
+    """
+    results = {}
+    for f in sorted(RESULTS_DIR.glob("pilot-*.json")):
+        # Context manager so the handle is closed right away; a bare
+        # json.load(open(...)) leaves one open file per result until GC.
+        with open(f, encoding="utf-8") as fh:
+            d = json.load(fh)
+        for m, r in d["models"].items():
+            if m not in results or len(r) >= len(results[m]["data"]):
+                results[m] = {
+                    "data": r,
+                    "file": f.name,
+                    "config": d.get("config", {}),
+                    "duration": d.get("duration_seconds", 0),
+                    "timestamp": d.get("timestamp", ""),
+                }
+    return results
+
+
+def score_color(score):
+    """Return the foreground hex colour for a score: >= 0.8 green, >= 0.5 yellow, otherwise red."""
+    thresholds = (
+        (0.8, "#22c55e"),  # green
+        (0.5, "#eab308"),  # yellow
+    )
+    for minimum, colour in thresholds:
+        if score >= minimum:
+            return colour
+    return "#ef4444"  # red
+
+
+def score_bg(score):
+    """Return the background hex colour for a score, using the same 0.8 / 0.5 cut-offs as score_color."""
+    return "#dcfce7" if score >= 0.8 else ("#fef9c3" if score >= 0.5 else "#fee2e2")
+
+
+def generate_html(results, output_path):
+ # Collect all anchors and questions
+ all_questions = defaultdict(dict) # anchor/label -> {model: score}
+ model_names = []
+
+ # Prefer full runs (75 questions) over pilot runs
+ for m in ["claude", "openai", "mistral"]:
+ if m in results and len(results[m]["data"]) >= 60:
+ model_names.append(m)
+
+ # Add smaller runs if no full run exists
+ for m in ["claude-cli", "claude-haiku", "ollama"]:
+ if m in results and m not in model_names:
+ model_names.append(m)
+
+ for m in model_names:
+ for q in results[m]["data"]:
+ label = q["label"]
+ all_questions[label][m] = q["score"]
+
+ # Separate controls from anchors
+ anchor_questions = {k: v for k, v in all_questions.items()
+ if not any(k.startswith(c) for c in CONTROL_ANCHORS)}
+ control_questions = {k: v for k, v in all_questions.items()
+ if any(k.startswith(c) for c in CONTROL_ANCHORS)}
+
+ # Group by anchor
+ anchor_groups = defaultdict(list)
+ for label in sorted(anchor_questions.keys()):
+ anchor_id = label.split("/")[0]
+ anchor_groups[anchor_id].append(label)
+
+ # Model averages (excluding controls)
+ model_avgs = {}
+ for m in model_names:
+ scores = [anchor_questions[label].get(m) for label in anchor_questions
+ if anchor_questions[label].get(m) is not None]
+ model_avgs[m] = sum(scores) / len(scores) if scores else 0
+
+ html = f"""
+
+
+
+
+Semantic Anchor Evaluation Report
+
+
+
+Semantic Anchor Evaluation Report
+Multiple-choice recognition test across {len(model_names)} LLMs — {len(anchor_questions)} questions, {len(anchor_groups)} anchors
+
+
+
+Model Summary
+
+"""
+
+ for m in model_names:
+ avg = model_avgs.get(m, 0)
+ display = MODEL_DISPLAY.get(m, m)
+ n = len([1 for l in anchor_questions if anchor_questions[l].get(m) is not None])
+ info = results[m]
+ html += f"""
+
{display}
+
{avg:.0%}
+
{n} questions · {info['file']}
+
+"""
+
+ html += """
+
+Heatmap: Anchor × Model
+
+
+ | Anchor / Question |
+"""
+
+ for m in model_names:
+ html += f" {MODEL_DISPLAY.get(m, m)} | \n"
+ html += "
\n\n"
+
+ for anchor_id in sorted(anchor_groups.keys()):
+ labels = anchor_groups[anchor_id]
+ # Anchor group row with average
+ anchor_scores = {}
+ for m in model_names:
+ scores = [anchor_questions[l].get(m) for l in labels if anchor_questions[l].get(m) is not None]
+ anchor_scores[m] = sum(scores) / len(scores) if scores else None
+
+ html += f'| {h(anchor_id)} | '
+ for m in model_names:
+ s = anchor_scores.get(m)
+ if s is not None:
+ bg = score_bg(s)
+ text = "✓" if s == 1.0 else f"{s:.0%}"
+ html += f'{text} | '
+ else:
+ html += '— | '
+ html += "
\n"
+
+ # Individual question rows (only show if there are multiple or if score < 100%)
+ if len(labels) > 1:
+ for label in labels:
+ short = label.split("/", 1)[1] if "/" in label else label
+ html += f'| {h(short)} | '
+ for m in model_names:
+ s = anchor_questions[label].get(m)
+ if s is not None:
+ bg = score_bg(s)
+ text = "✓" if s == 1.0 else f"{s:.0%}"
+ html += f'{text} | '
+ else:
+ html += '— | '
+ html += "
\n"
+
+ html += "
\n"
+
+ # Controls section
+ if control_questions:
+ html += 'Control Questions
\n\n| Control | '
+ for m in model_names:
+ html += f"{MODEL_DISPLAY.get(m, m)} | "
+ html += "
\n\n"
+ for label in sorted(control_questions.keys()):
+ short = label.replace("/recognition", "")
+ html += f"| {short} | "
+ for m in model_names:
+ s = control_questions[label].get(m)
+ if s is not None:
+ bg = score_bg(s) if "sanity" not in label else ("#dcfce7" if s == 0 else "#fee2e2")
+ text = f"{s:.0%}"
+ html += f'{text} | '
+ else:
+ html += '— | '
+ html += "
\n"
+ html += "
\n"
+
+ # Failures detail
+ html += "Failures Detail
\n"
+ for m in model_names:
+ fails = [(q["label"], q["score"]) for q in results[m]["data"]
+ if q["score"] < 1.0 and not any(q["label"].startswith(c) for c in CONTROL_ANCHORS)]
+ if not fails:
+ html += f"{MODEL_DISPLAY.get(m, m)}: no failures
\n"
+ else:
+ html += f'{MODEL_DISPLAY.get(m, m)}: {len(fails)} failures
\n\n'
+ for label, score in sorted(fails):
+ html += f'
{h(label)}{score:.0%}
\n'
+ html += "
\n"
+
+ # Metadata
+ html += """
+
+
+
+"""
+
+ output_path.write_text(html, encoding="utf-8")
+ print(f"Report written to {output_path}")
+
+
+def main():
+    """Command-line entry point: load the best run per model, list them, then write the HTML report."""
+    cli = argparse.ArgumentParser(description="Generate HTML evaluation report")
+    cli.add_argument(
+        "--output",
+        default="evaluations/report.html",
+        help="Output HTML file (default: evaluations/report.html)",
+    )
+    options = cli.parse_args()
+
+    best = load_best_results()
+    print(f"Loaded results for {len(best)} models")
+    for model, info in best.items():
+        question_count = len(info["data"])
+        print(f" {model}: {question_count} questions from {info['file']}")
+
+    generate_html(best, Path(options.output))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/evaluations/pilot.py b/evaluations/pilot.py
new file mode 100644
index 0000000..c921ad0
--- /dev/null
+++ b/evaluations/pilot.py
@@ -0,0 +1,502 @@
+#!/usr/bin/env python3
+"""
+Pilot evaluation runner for semantic anchor multiple-choice tests.
+Reads YAML specs, sends questions to LLMs, scores responses.
+
+Usage:
+ python3 pilot.py --model claude # Claude Sonnet via Anthropic API
+ python3 pilot.py --model ollama # Local model via Ollama (OpenAI-compatible)
+ python3 pilot.py --model claude ollama # Both
+ python3 pilot.py --dry-run # Show prompts without sending
+"""
+
+import argparse
+import json
+import os
+import random
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+try:
+ import yaml
+except ImportError:
+ print("PyYAML required: pip install pyyaml")
+ sys.exit(1)
+
# Input specs and output results live next to this script.
SPECS_DIR = Path(__file__).parent.joinpath("specs")
RESULTS_DIR = Path(__file__).parent.joinpath("results")

# Answer labels, in the order they are displayed to the model.
LETTERS = ["A", "B", "C", "D"]

# The four cyclic rotations of the option order. Entry k lists, for each
# displayed slot, the index of the original option shown in that slot:
#   [0, 1, 2, 3]  A B C D (original)
#   [1, 2, 3, 0]  B C D A
#   [2, 3, 0, 1]  C D A B
#   [3, 0, 1, 2]  D A B C
POSITION_PERMUTATIONS = [
    [(shift + slot) % len(LETTERS) for slot in range(len(LETTERS))]
    for shift in range(len(LETTERS))
]
+
+
def load_specs():
    """Load every ``*.yaml`` file found directly in ``SPECS_DIR``.

    Returns:
        A list with one parsed YAML document per file, ordered by sorted
        file path.
    """
    return [
        yaml.safe_load(spec_path.read_text(encoding="utf-8"))
        for spec_path in sorted(SPECS_DIR.glob("*.yaml"))
    ]
+
+
def build_prompt(question_text, options, permutation):
    """Render a multiple-choice prompt with the options in permuted order.

    Args:
        question_text: The question; surrounding whitespace is stripped.
        options: Mapping from original letter ("A".."D") to option text.
        permutation: For each displayed slot, the index (into LETTERS) of
            the original option to show there.

    Returns:
        The prompt text: question, blank line, one "X) text" line per
        option, blank line, and the instruction to answer with a letter.
    """
    option_lines = [
        f"{LETTERS[slot]}) {options[LETTERS[source_idx]]}"
        for slot, source_idx in enumerate(permutation)
    ]
    parts = [question_text.strip(), "", *option_lines, "", "Answer with the letter only."]
    return "\n".join(parts)
+
+
def correct_letter_for_permutation(original_correct, permutation):
    """Translate the original correct letter into its displayed letter.

    Args:
        original_correct: Correct letter in the spec's own ordering, or
            "X" for sanity checks that have no correct answer.
        permutation: For each displayed slot, the index of the original
            option shown there.

    Returns:
        "X" for sanity checks; otherwise the letter of the slot that shows
        the original correct option, or None if the permutation does not
        contain it.
    """
    if original_correct == "X":
        return "X"
    target = LETTERS.index(original_correct)
    matching_slots = (
        LETTERS[slot]
        for slot, source_idx in enumerate(permutation)
        if source_idx == target
    )
    return next(matching_slots, None)
+
+
def parse_response(text):
    """Extract the answer letter (A-D) from a model response.

    Processing steps:
      1. ``<think>...</think>`` blocks (emitted by reasoning models such as
         qwen3 or DeepSeek R1) are removed; if nothing remains, the
         original text is used instead.
      2. Text starting with "ERROR:" is the marker this script uses for a
         failed call and never contains an answer.
      3. A line consisting only of a letter (optionally decorated, e.g.
         "B", "B)", "**B**", "b") is the strongest signal and wins.
      4. Otherwise the first *standalone* capital A-D is used, so the "A"
         of "Answer: B" or the "C" of "Correct" is not mistaken for an
         answer.

    Args:
        text: Raw response text; may be empty or None.

    Returns:
        The uppercase letter "A".."D", or None if no answer was found.
    """
    import re

    if not text:
        return None

    # Remove thinking blocks (qwen3, DeepSeek R1, etc.)
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    # If nothing is left after stripping, fall back to the original text
    if not cleaned:
        cleaned = text.strip()

    # Error markers must not be mined for letters ("ERROR: Connection ..." is not "C").
    if cleaned.startswith("ERROR:"):
        return None

    # First: look for a line that is just a letter (strongest signal)
    for line in cleaned.split("\n"):
        candidate = line.strip().strip("*").strip(".").strip(")").strip()
        if candidate.upper() in ("A", "B", "C", "D"):
            return candidate.upper()

    # Fallback: first capital A-D that stands alone as a word
    standalone = re.search(r"\b[A-D]\b", cleaned)
    return standalone.group(0) if standalone else None
+
+
# Sampling temperature read by the API callers at call time.
# Overridden from the command line through set_temperature().
TEMPERATURE = 0.0


def set_temperature(t):
    """Replace the module-wide sampling temperature with *t*."""
    global TEMPERATURE
    TEMPERATURE = t
+
+
def call_claude_api(prompt, model="claude-sonnet-4-20250514"):
    """Send a single-turn prompt to Claude through the Anthropic API.

    The ``anthropic`` package is imported lazily; if it is missing, a hint
    is printed and the process exits with status 1.

    Args:
        prompt: User message to send.
        model: Anthropic model identifier.

    Returns:
        A ``(text, model)`` tuple where ``text`` is the text of the first
        content block of the reply.
    """
    try:
        import anthropic
    except ImportError:
        print("anthropic package required: pip install anthropic")
        sys.exit(1)

    request = {
        "model": model,
        "max_tokens": 10,  # the prompt asks for a single letter
        "temperature": TEMPERATURE,
        "messages": [{"role": "user", "content": prompt}],
    }
    reply = anthropic.Anthropic().messages.create(**request)
    first_block = reply.content[0]
    return first_block.text, model
+
+
def call_claude_cli(prompt, model="claude-cli"):
    """Send the prompt to Claude Sonnet by running ``claude -p``.

    Note: temperature cannot be controlled via the CLI.

    Returns:
        ``(stdout, model)`` on success, or ``("ERROR: <stderr>", model)``
        when the command exits with a non-zero status. The command is
        given 60 seconds to finish.
    """
    import subprocess

    command = ["claude", "-p", prompt]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=60)
    if proc.returncode == 0:
        return proc.stdout.strip(), model
    return f"ERROR: {proc.stderr.strip()}", model
+
+
def call_claude_haiku(prompt, model="claude-haiku"):
    """Send the prompt to Claude Haiku by running ``claude -p --model haiku``.

    Note: temperature cannot be controlled via the CLI.

    Returns:
        ``(stdout, model)`` on success, or ``("ERROR: <stderr>", model)``
        when the command exits with a non-zero status. The command is
        given 60 seconds to finish.
    """
    import subprocess

    command = ["claude", "-p", prompt, "--model", "haiku"]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=60)
    if proc.returncode == 0:
        return proc.stdout.strip(), model
    return f"ERROR: {proc.stderr.strip()}", model
+
+
def make_openai_caller(openai_model):
    """Build a caller that sends prompts to the given OpenAI chat model.

    The returned function takes ``(prompt, model=openai_model)`` and
    returns ``(stripped reply text, model)``. The ``openai`` package is
    imported lazily; if it is missing, a hint is printed and the process
    exits with status 1.
    """
    def call_openai(prompt, model=openai_model):
        try:
            import openai
        except ImportError:
            print("openai package required: pip install openai")
            sys.exit(1)

        client = openai.OpenAI()
        request = {"model": model, "messages": [{"role": "user", "content": prompt}]}
        # GPT-5+ and reasoning models require different parameters
        uses_new_api = "gpt-5" in model or "o3" in model or "o4" in model
        if uses_new_api:
            # GPT-5 only supports temperature=1, so no temperature is sent
            request["max_completion_tokens"] = 2048
        else:
            request.update(max_tokens=10, temperature=TEMPERATURE)
        completion = client.chat.completions.create(**request)
        reply_text = completion.choices[0].message.content
        return reply_text.strip(), model

    return call_openai
+
+
def make_mistral_caller(mistral_model):
    """Build a caller for a Mistral model via its OpenAI-compatible API.

    The returned function takes ``(prompt, model=mistral_model)`` and
    returns ``(stripped reply text, model)``. The API key is read from the
    ``MISTRAL_API_KEY`` environment variable (empty string if unset). The
    ``openai`` package is imported lazily; if it is missing, a hint is
    printed and the process exits with status 1.
    """
    def call_mistral(prompt, model=mistral_model):
        try:
            import openai
        except ImportError:
            print("openai package required: pip install openai")
            sys.exit(1)

        api_key = os.environ.get("MISTRAL_API_KEY", "")
        client = openai.OpenAI(base_url="https://api.mistral.ai/v1", api_key=api_key)
        request = {
            "model": model,
            "max_tokens": 10,
            "temperature": TEMPERATURE,
            "messages": [{"role": "user", "content": prompt}],
        }
        completion = client.chat.completions.create(**request)
        reply_text = completion.choices[0].message.content
        return reply_text.strip(), model

    return call_mistral
+
+
def make_deepseek_caller(deepseek_model):
    """Build a caller for a DeepSeek model via its OpenAI-compatible API.

    The returned function takes ``(prompt, model=deepseek_model)`` and
    returns ``(stripped reply text, model)``. The API key is read from the
    ``DEEPSEEK_API_KEY`` environment variable (empty string if unset). The
    ``openai`` package is imported lazily; if it is missing, a hint is
    printed and the process exits with status 1.
    """
    def call_deepseek(prompt, model=deepseek_model):
        try:
            import openai
        except ImportError:
            print("openai package required: pip install openai")
            sys.exit(1)

        api_key = os.environ.get("DEEPSEEK_API_KEY", "")
        client = openai.OpenAI(base_url="https://api.deepseek.com", api_key=api_key)
        request = {
            "model": model,
            "max_tokens": 10,
            "temperature": TEMPERATURE,
            "messages": [{"role": "user", "content": prompt}],
        }
        completion = client.chat.completions.create(**request)
        reply_text = completion.choices[0].message.content
        return reply_text.strip(), model

    return call_deepseek
+
+
def make_ollama_caller(ollama_model, no_think=False, base_url="http://localhost:11434"):
    """Build a caller that sends prompts to an Ollama server.

    Args:
        ollama_model: Model name used as the default for the caller.
        no_think: When true, ``"think": false`` is added to the request.
        base_url: Base URL of the Ollama server; ``/api/chat`` is appended.

    Returns:
        A function ``(prompt, model=ollama_model)`` that POSTs a
        non-streaming chat request and returns
        ``(message content, "ollama/<model>")``. The content defaults to an
        empty string when the reply carries no message content.
    """
    def call_ollama(prompt, model=ollama_model):
        from urllib.request import Request, urlopen

        payload = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
            "options": {"temperature": TEMPERATURE},
        }
        if no_think:
            payload["think"] = False

        request = Request(
            f"{base_url}/api/chat",
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        with urlopen(request, timeout=300) as reply:
            parsed = json.loads(reply.read())

        message = parsed.get("message", {})
        return message.get("content", ""), f"ollama/{model}"

    return call_ollama
+
+
def run_question(question_data, call_fn, label, context="", verbose=False):
    """Ask one question once per entry of POSITION_PERMUTATIONS and score it.

    Args:
        question_data: Mapping with "question" (text), "options" (letter ->
            option text) and "correct" (original correct letter, or "X"
            when no option is correct).
        call_fn: Callable taking the prompt and returning a
            ``(response_text, model_id)`` tuple.
        label: Identifier stored in the returned record.
        context: Optional text placed on its own line before the question.
        verbose: When true, print call errors and the raw response of the
            first permutation.

    Returns:
        ``{"label", "score", "results"}`` where ``score`` is the fraction
        of permutations answered correctly and ``results`` holds one record
        per permutation (displayed order, expected letter, parsed answer,
        correctness, first 500 characters of the raw response).

    A failed call is recorded as an "ERROR: ..." response and always counts
    as incorrect; its text is never searched for an answer letter.
    """
    question_text = question_data["question"]
    if context:
        question_text = f"{context}\n{question_text}"
    options = question_data["options"]
    original_correct = question_data["correct"]

    results = []
    for i, perm in enumerate(POSITION_PERMUTATIONS):
        prompt = build_prompt(question_text, options, perm)
        expected = correct_letter_for_permutation(original_correct, perm)

        try:
            response_text, _ = call_fn(prompt)
        except Exception as e:
            # Deliberately broad: one failing call must not abort the whole run.
            response_text = f"ERROR: {e}"
            if verbose:
                print(f"\n [ERROR] {e}")

        # Callers may hand back None (e.g. a reply without content).
        response_text = response_text or ""

        # "ERROR: ..." comes from the except branch above or from the CLI
        # callers; parsing it could turn e.g. "Connection" into answer "C".
        if response_text.startswith("ERROR:"):
            answer = None
        else:
            answer = parse_response(response_text)
        # Require a real answer so that None == None never counts as correct.
        correct = answer is not None and answer == expected

        if verbose and i == 0:  # show first permutation only
            print(f"\n [RAW] expected={expected} parsed={answer} response={repr(response_text[:200])}")

        results.append({
            "permutation": [LETTERS[p] for p in perm],
            "expected": expected,
            "answer": answer,
            "correct": correct,
            "raw_response": response_text.strip()[:500],
        })
        time.sleep(0.5)  # rate limiting

    score = sum(1 for r in results if r["correct"]) / len(results)
    return {
        "label": label,
        "score": score,
        "results": results,
    }
+
+
def save_results(all_results, out_file):
    """Write *all_results* to *out_file* as indented UTF-8 JSON.

    Called after every question so partial runs are kept on disk.
    ``RESULTS_DIR`` is created first if it does not exist yet.
    """
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(all_results, indent=2, ensure_ascii=False)
    with open(out_file, "w", encoding="utf-8") as sink:
        sink.write(serialized)
+
+
def _iter_spec_questions(spec):
    """Yield ``(display_name, result_label, question)`` for each question of a spec.

    ``question`` is a mapping with "question", "options" and "correct", i.e.
    the shape expected by run_question().
    """
    anchor = spec["anchor"]
    questions = spec.get("questions") or {}

    # Level 1: Recognition
    if "recognition" in questions:
        yield f"{anchor} / recognition", f"{anchor}/recognition", questions["recognition"]

    # Level 2: Application — the same scenario and options are asked twice,
    # once with the anchor prompt and once with the paraphrase prompt.
    if "application" in questions:
        app = questions["application"]
        scenario = app["scenario"].strip()
        for kind in ("anchor", "paraphrase"):
            prompt_text = app[f"{kind}_prompt"]
            question = {
                "question": f"{scenario}\n{prompt_text}",
                "options": app["options"],
                "correct": app["correct"],
            }
            yield f"{anchor} / application ({kind})", f"{anchor}/application-{kind}", question

    # Level 4: Consistency — numbered variants, then the optional language variant.
    if "consistency" in questions:
        cons = questions["consistency"]
        named_variants = [
            (f"variant-{number}", text)
            for number, text in enumerate(cons.get("variants", []), start=1)
        ]
        lang = cons.get("language_variant")
        if lang:
            named_variants.append(("language", lang))
        for variant_label, text in named_variants:
            question = {
                "question": text,
                "options": cons["options"],
                "correct": cons["correct"],
            }
            yield (
                f"{anchor} / consistency ({variant_label})",
                f"{anchor}/consistency-{variant_label}",
                question,
            )


def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_think=False,
              ollama_url="http://localhost:11434", openai_model="gpt-4o-mini",
              mistral_model="mistral-large-latest", deepseek_model="deepseek-chat"):
    """Run every spec question against each requested model and report scores.

    Args:
        models: Model keys to evaluate ("claude", "claude-cli",
            "claude-haiku", "openai", "mistral", "deepseek", "ollama").
            Unknown keys are reported and skipped.
        dry_run: When true, only print the prompt (first permutation) of
            every question; nothing is sent and nothing is written.
        verbose: Forwarded to run_question() for debug output.
        ollama_model, no_think, ollama_url: Settings for the "ollama" key.
        openai_model, mistral_model, deepseek_model: Model names used for
            the corresponding keys.

    Results are written to ``RESULTS_DIR/pilot-<timestamp>.json`` after
    every question (unless dry_run) and a summary is printed at the end.
    """
    start_time = time.time()
    specs = load_specs()
    print(f"Loaded {len(specs)} anchor specs")
    print(f"Models: {', '.join(models)}")
    print(f"Temperature: {TEMPERATURE}")
    if "openai" in models:
        print(f"OpenAI model: {openai_model}")
    if "mistral" in models:
        print(f"Mistral model: {mistral_model}")
    if "deepseek" in models:
        print(f"DeepSeek model: {deepseek_model}")
    if "ollama" in models:
        print(f"Ollama model: {ollama_model}")
        print(f"Ollama URL: {ollama_url}")
        print(f"No-think: {no_think}")
    print(f"Dry run: {dry_run}")
    print()

    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    out_file = RESULTS_DIR / f"pilot-{ts}.json"

    all_results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "config": {
            "models": models,
            "openai_model": openai_model if "openai" in models else None,
            "mistral_model": mistral_model if "mistral" in models else None,
            "deepseek_model": deepseek_model if "deepseek" in models else None,
            "ollama_model": ollama_model if "ollama" in models else None,
            "ollama_url": ollama_url if "ollama" in models else None,
            "no_think": no_think if "ollama" in models else None,
            "temperature": TEMPERATURE,
        },
        "models": {},
    }

    # Model key -> zero-argument factory, so callers are only built when needed.
    caller_factories = {
        "claude": lambda: call_claude_api,
        "claude-cli": lambda: call_claude_cli,
        "claude-haiku": lambda: call_claude_haiku,
        "openai": lambda: make_openai_caller(openai_model),
        "mistral": lambda: make_mistral_caller(mistral_model),
        "deepseek": lambda: make_deepseek_caller(deepseek_model),
        "ollama": lambda: make_ollama_caller(ollama_model, no_think=no_think, base_url=ollama_url),
    }

    # The question plan is the same for every model, so build it once.
    plan = [entry for spec in specs for entry in _iter_spec_questions(spec)]
    total_q = len(plan)

    for model_name in models:
        factory = caller_factories.get(model_name)
        if factory is None:
            print(f"Unknown model: {model_name}")
            continue
        call_fn = factory()

        print(f"=== {model_name.upper()} ({total_q} questions) ===")
        # Registered up front so incremental saves include partial results.
        model_results = []
        all_results["models"][model_name] = model_results

        for display_name, result_label, question in plan:
            if dry_run:
                prompt = build_prompt(question["question"], question["options"], POSITION_PERMUTATIONS[0])
                print(f"\n[DRY RUN] {display_name}:")
                print(prompt)
                continue

            print(f" [{len(model_results) + 1}/{total_q}] {display_name}...", end=" ", flush=True)
            result = run_question(question, call_fn, result_label, verbose=verbose)
            print(f"{result['score']:.0%}")
            model_results.append(result)
            save_results(all_results, out_file)

    elapsed = time.time() - start_time
    all_results["duration_seconds"] = round(elapsed, 1)

    if not dry_run:
        save_results(all_results, out_file)
        print(f"\nResults saved to {out_file}")

    # Summary
    print("\n=== SUMMARY ===")
    print(f"Models: {', '.join(models)}")
    print(f"Temperature: {TEMPERATURE}")
    if "openai" in models:
        print(f"OpenAI: {openai_model}")
    if "mistral" in models:
        print(f"Mistral: {mistral_model}")
    if "deepseek" in models:
        print(f"DeepSeek: {deepseek_model}")
    if "ollama" in models:
        print(f"Ollama: {ollama_model} @ {ollama_url} (no-think={no_think})")
    minutes, seconds = divmod(int(elapsed), 60)
    print(f"Duration: {minutes}m {seconds}s")
    print()
    for model_name, results in all_results["models"].items():
        scores = [r["score"] for r in results]
        avg = sum(scores) / len(scores) if scores else 0
        print(f"{model_name}: {avg:.0%} average ({len(scores)} questions)")
        for r in results:
            status = "✓" if r["score"] == 1.0 else f"{r['score']:.0%}"
            print(f" {r['label']}: {status}")
+
+
if __name__ == "__main__":
    # Command-line interface: parse the options, apply the temperature, run.
    parser = argparse.ArgumentParser(description="Pilot evaluation runner")
    parser.add_argument(
        "--model",
        nargs="+",
        default=["claude-cli"],
        choices=["claude", "claude-cli", "claude-haiku", "openai", "mistral", "deepseek", "ollama"],
        help="Models to evaluate (default: claude-cli)",
    )

    # Plain string options: (flag, default, help text).
    for flag, default_value, help_text in (
        ("--openai-model", "gpt-4o-mini",
         "OpenAI model name (default: gpt-4o-mini). Try: gpt-5, gpt-5-mini, gpt-4o"),
        ("--mistral-model", "mistral-large-latest",
         "Mistral model name (default: mistral-large-latest)"),
        ("--deepseek-model", "deepseek-chat",
         "DeepSeek model name (default: deepseek-chat)"),
        ("--ollama-model", "qwen3:4b",
         "Ollama model name (default: qwen3:4b)"),
        ("--ollama-url", "http://localhost:11434",
         "Ollama API base URL (default: http://localhost:11434)"),
    ):
        parser.add_argument(flag, default=default_value, help=help_text)

    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="Sampling temperature (default: 0.0). Note: claude-cli/claude-haiku ignore this.",
    )

    # Boolean switches: (flag, help text).
    for flag, help_text in (
        ("--no-think", "Disable reasoning/thinking for Ollama models (faster, fewer tokens)"),
        ("--dry-run", "Show prompts without sending"),
        ("--verbose", "Print raw responses for debugging"),
    ):
        parser.add_argument(flag, action="store_true", help=help_text)

    args = parser.parse_args()
    set_temperature(args.temperature)
    run_pilot(
        args.model,
        dry_run=args.dry_run,
        verbose=args.verbose,
        ollama_model=args.ollama_model,
        no_think=args.no_think,
        ollama_url=args.ollama_url,
        openai_model=args.openai_model,
        mistral_model=args.mistral_model,
        deepseek_model=args.deepseek_model,
    )
diff --git a/evaluations/report.html b/evaluations/report.html
new file mode 100644
index 0000000..0fd9fb2
--- /dev/null
+++ b/evaluations/report.html
@@ -0,0 +1,388 @@
+
+
+
+
+
+Semantic Anchor Evaluation Report
+
+
+
+Semantic Anchor Evaluation Report
+Multiple-choice recognition test across 3 LLMs — 191 questions, 61 anchors
+
+
+
+Model Summary
+
+
+
Claude Sonnet
+
99%
+
191 questions · pilot-20260324-174404.json
+
+
+
GPT-4o
+
98%
+
191 questions · pilot-20260324-192413.json
+
+
+
Mistral Large
+
96%
+
191 questions · pilot-20260324-190600.json
+
+
+
+Heatmap: Anchor × Model
+
+
+ | Anchor / Question |
+ Claude Sonnet |
+ GPT-4o |
+ Mistral Large |
+
+
+| adr-according-to-nygard | ✓ | ✓ | 92% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | 75% |
+| arc42 | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| consistency-language | ✓ | ✓ | ✓ |
+| consistency-variant-1 | ✓ | ✓ | ✓ |
+| consistency-variant-2 | ✓ | ✓ | ✓ |
+| consistency-variant-3 | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| atam | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| bdd-given-when-then | ✓ | ✓ | 83% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ |
+| bem-methodology | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| bluf | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| c4-diagrams | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| chain-of-thought | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| clean-architecture | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| control-chart-shewhart | ✓ | 92% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| conventional-commits | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| cqrs | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| cynefin-framework | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| definition-of-done | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| devils-advocate | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| diataxis-framework | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| docs-as-code | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| domain-driven-design | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| ears-requirements | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | 75% |
+| event-driven-architecture | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| fagan-inspection | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| feynman-technique | 67% | 67% | 92% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | 0% | 0% | 75% |
+| recognition | ✓ | ✓ | ✓ |
+| five-whys | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| fowler-patterns | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| gherkin | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| github-flow | 92% | 92% | 92% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | 75% | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ |
+| gutes-deutsch-wolf-schneider | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| hexagonal-architecture | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| iec-61508-sil-levels | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| impact-mapping | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| invest | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| iso-25010 | ✓ | ✓ | 83% |
+| application-anchor | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ |
+| jobs-to-be-done | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| lasr | ✓ | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | 75% | 25% |
+| linddun | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| llm-evaluations | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| madr | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| mece | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| morphological-box | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| moscow | ✓ | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | 75% | 25% |
+| recognition | ✓ | ✓ | ✓ |
+| mutation-testing | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| nelson-rules | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| owasp-top-10 | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| plain-english-strunk-white | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| prd | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | 75% | 0% |
+| problem-space-nvc | ✓ | ✓ | 83% |
+| application-anchor | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ |
+| property-based-testing | ✓ | 83% | ✓ |
+| application-anchor | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| pyramid-principle | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| semantic-versioning | ✓ | ✓ | 75% |
+| application-anchor | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ |
+| socratic-method | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| sota | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| spc | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| stride | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| swot | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| tdd-chicago-school | ✓ | 92% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | 75% | ✓ |
+| tdd-london-school | ✓ | 89% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| consistency-language | ✓ | ✓ | ✓ |
+| consistency-variant-1 | ✓ | ✓ | ✓ |
+| consistency-variant-2 | ✓ | ✓ | ✓ |
+| consistency-variant-3 | ✓ | 50% | ✓ |
+| recognition | ✓ | 75% | ✓ |
+| testing-pyramid | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| timtowtdi | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| todotxt-flavoured-markdown | ✓ | ✓ | 83% |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | 50% |
+| user-story-mapping | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+| wardley-mapping | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ |
+
+Control Questions
+
+| Control | Claude Sonnet | GPT-4o | Mistral Large |
+
+| negative-control | 100% | 100% | 75% |
+| sanity-check | 0% | 0% | 0% |
+
+Failures Detail
+Claude Sonnet: 2 failures
+
+
feynman-technique/application-paraphrase0%
+
github-flow/application-paraphrase75%
+
+GPT-4o: 13 failures
+
+
control-chart-shewhart/application-paraphrase75%
+
ears-requirements/application-paraphrase75%
+
feynman-technique/application-paraphrase0%
+
github-flow/application-paraphrase75%
+
iec-61508-sil-levels/application-paraphrase75%
+
lasr/recognition75%
+
moscow/application-paraphrase75%
+
prd/recognition75%
+
property-based-testing/application-anchor75%
+
property-based-testing/application-paraphrase75%
+
tdd-chicago-school/recognition75%
+
tdd-london-school/consistency-variant-350%
+
tdd-london-school/recognition75%
+
+Mistral Large: 17 failures
+
+
adr-according-to-nygard/recognition75%
+
bdd-given-when-then/application-paraphrase50%
+
ears-requirements/application-paraphrase75%
+
ears-requirements/recognition75%
+
feynman-technique/application-paraphrase75%
+
github-flow/application-paraphrase75%
+
iec-61508-sil-levels/application-anchor50%
+
iso-25010/application-anchor75%
+
iso-25010/application-paraphrase75%
+
lasr/recognition25%
+
moscow/application-paraphrase25%
+
prd/recognition0%
+
problem-space-nvc/application-anchor75%
+
problem-space-nvc/application-paraphrase75%
+
semantic-versioning/application-anchor50%
+
semantic-versioning/application-paraphrase75%
+
todotxt-flavoured-markdown/recognition50%
+
+
+
+
+
+
\ No newline at end of file
diff --git a/evaluations/results/pilot-20260324-174404.json b/evaluations/results/pilot-20260324-174404.json
new file mode 100644
index 0000000..110f33a
--- /dev/null
+++ b/evaluations/results/pilot-20260324-174404.json
@@ -0,0 +1,10442 @@
+{
+ "timestamp": "2026-03-24T17:44:04.891380+00:00",
+ "config": {
+ "models": [
+ "claude"
+ ],
+ "openai_model": null,
+ "mistral_model": null,
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "claude": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "atam/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "madr/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "prd/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "X",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "X",
+ "answer": null,
+ "correct": false,
+ "raw_response": "None of the options provided match the famous answer from"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "X",
+ "answer": null,
+ "correct": false,
+ "raw_response": "None of the options provided match the correct answer from"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "X",
+ "answer": null,
+ "correct": false,
+ "raw_response": "None of the options provided match the famous answer from"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "spc/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "swot/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ }
+ ]
+ },
+ "duration_seconds": 4862.1
+}
\ No newline at end of file
diff --git a/evaluations/results/pilot-20260324-190600.json b/evaluations/results/pilot-20260324-190600.json
new file mode 100644
index 0000000..613af48
--- /dev/null
+++ b/evaluations/results/pilot-20260324-190600.json
@@ -0,0 +1,10442 @@
+{
+ "timestamp": "2026-03-24T19:06:00.394684+00:00",
+ "config": {
+ "models": [
+ "mistral"
+ ],
+ "openai_model": null,
+ "mistral_model": "mistral-large-latest",
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "mistral": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "atam/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 0.5,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": null,
+ "correct": false,
+ "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": null,
+ "correct": false,
+ "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 0.5,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 0.25,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ }
+ ]
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "madr/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 0.25,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "D",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "prd/recognition",
+ "score": 0.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ }
+ ]
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "X",
+ "answer": null,
+ "correct": false,
+ "raw_response": "None of the above options is correct, but the"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "X",
+ "answer": null,
+ "correct": false,
+ "raw_response": "None of the options provided is correct, but the"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "X",
+ "answer": null,
+ "correct": false,
+ "raw_response": "None of the options provided is correct, but the"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "X",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 0.5,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "spc/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "None of the options perfectly describe the **STRIDE"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "swot/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) SWOT Analysis\nB) Value Chain"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 0.5,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ }
+ ]
+ },
+ "duration_seconds": 1018.4
+}
\ No newline at end of file
diff --git a/evaluations/results/pilot-20260324-192413.json b/evaluations/results/pilot-20260324-192413.json
new file mode 100644
index 0000000..8c5f85e
--- /dev/null
+++ b/evaluations/results/pilot-20260324-192413.json
@@ -0,0 +1,10442 @@
+{
+ "timestamp": "2026-03-24T19:24:13.551875+00:00",
+ "config": {
+ "models": [
+ "openai"
+ ],
+ "openai_model": "gpt-4o",
+ "mistral_model": null,
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "openai": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) A 12-section template for standardized software"
+ }
+ ]
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) External interfaces in Section 3 (Context"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Gernot Starke"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Gernot Starke"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Gernot Starke"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Gernot Starke"
+ }
+ ]
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "atam/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Build a utility tree to prioritize quality scenarios"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Build a utility tree to prioritize quality scenarios"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Build a utility tree to prioritize quality scenarios"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Organize discovery workshops with the three amigos"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Use structured class names like `.navbar`,"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Use structured class names like `.navbar`,"
+ }
+ ]
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Use structured class names like `.navbar`,"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Use structured class names like `.navbar`,"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Use structured class names like `.navbar`,"
+ }
+ ]
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Four levels of abstraction; : system in"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Four levels of abstraction; : system in"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Define payment processing use cases in the core"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Measured value plotted over time; process"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Create separate optimized data models: a normalized"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Create separate optimized data models: a normalized"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Create separate optimized data models: a normalized"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Five domains; : best practices apply,"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Five domains; : best practices apply,"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Collaboratively create a single, team-wide"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Present opposing viewpoints even if not personally held"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Systematically argue against your own design by"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Four documentation types; : learning-oriented,"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Develop four distinct documentation sections: beginner tutorials"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Develop four distinct documentation sections: beginner tutorials"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Develop four distinct documentation sections: beginner tutorials"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Establish a ubiquitous language by working closely with"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Structure requirements using specific templates: 'The"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Structure requirements using specific templates: 'The"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Structure requirements using specific templates: 'The"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B) Create a comprehensive requirements specification document with functional"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Structure requirements using specific templates: 'The"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Use a Domain Model pattern for complex business"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Use a Domain Model pattern for complex business"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Use a Domain Model pattern for complex business"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Workflow steps"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Workflow steps"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Workflow steps"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Goal → Actors → Impacts → Deliver"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Map the retention goal to key actors ("
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Map the retention goal to key actors ("
+ }
+ ]
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Map the retention goal to key actors ("
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Split this into multiple smaller stories with specific"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Split this into multiple smaller stories with specific"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Interview users about the specific circumstances that led"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Interview users about the specific circumstances that led"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Interview users about the specific circumstances that led"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Interview users about the specific circumstances that led"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Interview users about the specific circumstances that led"
+ }
+ ]
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ }
+ ]
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Systematically analyze the system against seven privacy"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Systematically analyze the system against seven privacy"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "madr/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Document the decision with sections for context,"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Document the decision with sections for context,"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Document the decision with sections for context,"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Structuring categories so they do not overlap"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Organize by business capability: User Service"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Organize by business capability: User Service"
+ }
+ ]
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Organize by business capability: User Service"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Organize by business capability: User Service"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Break complex problem into independent parameters/dim"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Break complex problem into independent parameters/dim"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Break complex problem into independent parameters/dim"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "prd/recognition",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ }
+ ]
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Write a comprehensive document that defines the problem"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Write a comprehensive document that defines the problem"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Concrete, objective facts without evaluation or judgment"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Invariants that should always hold; automatic"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Define mathematical invariants like 'interest calculations"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D) Write comprehensive unit tests covering typical financial scenarios"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Define mathematical invariants like 'interest calculations"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "X",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "None of the options provided are correct. The Answer"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "X",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "None of the options provided correspond to the Answer to"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "X",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "None of the options provided correspond to the Answer to"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "X",
+ "answer": "A",
+ "correct": false,
+ "raw_response": "None of the options provided correspond to the Answer to"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) 3.0.0 - because"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "spc/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "None of the options provided accurately describe the STRIDE"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "swot/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Analyze internal strengths and weaknesses of your current"
+ }
+ ]
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Analyze internal strengths and weaknesses of your current"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Begin with tests for the core pricing calculations"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 0.75,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "D",
+ "correct": false,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Write a test that mocks PaymentGateway and"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Steve Freeman"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Steve Freeman"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Steve Freeman"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Steve Freeman"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Steve Freeman"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Steve Freeman"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Steve Freeman"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Steve Freeman"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 0.5,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "C",
+ "correct": false,
+ "raw_response": "C) Dan North"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "B",
+ "correct": false,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Steve Freeman"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C) Steve Freeman"
+ }
+ ]
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D) Steve Freeman"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Three layers; more unit tests, fewer"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A) Three layers; more unit tests, fewer"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A)"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C)"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B)"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 1.0,
+ "results": [
+ {
+ "permutation": [
+ "A",
+ "B",
+ "C",
+ "D"
+ ],
+ "expected": "B",
+ "answer": "B",
+ "correct": true,
+ "raw_response": "B) Map the payment processing component's position on"
+ },
+ {
+ "permutation": [
+ "B",
+ "C",
+ "D",
+ "A"
+ ],
+ "expected": "A",
+ "answer": "A",
+ "correct": true,
+ "raw_response": "A"
+ },
+ {
+ "permutation": [
+ "C",
+ "D",
+ "A",
+ "B"
+ ],
+ "expected": "D",
+ "answer": "D",
+ "correct": true,
+ "raw_response": "D)"
+ },
+ {
+ "permutation": [
+ "D",
+ "A",
+ "B",
+ "C"
+ ],
+ "expected": "C",
+ "answer": "C",
+ "correct": true,
+ "raw_response": "C"
+ }
+ ]
+ }
+ ]
+ },
+ "duration_seconds": 938.4
+}
\ No newline at end of file
diff --git a/evaluations/specs/adr-according-to-nygard.yaml b/evaluations/specs/adr-according-to-nygard.yaml
new file mode 100644
index 0000000..95a2e80
--- /dev/null
+++ b/evaluations/specs/adr-according-to-nygard.yaml
@@ -0,0 +1,39 @@
+anchor: adr-according-to-nygard
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "ADR according to Nygard"?
+
+ '
+ options:
+ A: Comprehensive architecture documentation with twelve standardized sections
+ covering context, building blocks, runtime, and deployment views
+ B: Lightweight records that capture a single architecture decision with its
+ context, decision, status, and consequences in a short, focused format
+ C: Collaborative review process where stakeholders evaluate architecture tradeoffs
+ through scenario-based analysis and quality attribute workshops
+ D: Visual modeling technique that represents software architecture at four levels
+ of abstraction from system context down to code
+ correct: B
+ application:
+ scenario: Your team is building a microservices platform and needs to choose between
+ REST APIs and GraphQL for service communication. The decision involves trade-offs
+ between performance, complexity, team expertise, and future scalability requirements.
+ anchor_prompt: using ADR according to Nygard
+ paraphrase_prompt: How should you document this architectural choice to ensure
+ future team members understand the reasoning and can make informed decisions
+ about related architecture changes?
+ options:
+ A: Create a comprehensive design document with detailed technical specifications,
+ implementation guidelines, and performance benchmarks that can be updated
+ as requirements change.
+ B: Write a concise record with the decision title, current status, context explaining
+ the forces at play, the chosen solution, and both positive and negative consequences,
+ then store it immutably with the codebase.
+ C: Document the decision in the project wiki with a detailed comparison matrix,
+ stakeholder approval signatures, and a change log for future modifications
+ to the decision.
+ D: Prepare a formal architecture review presentation covering the evaluation
+ criteria, alternative solutions considered, and implementation roadmap, then
+ archive it in the project repository.
+ correct: B
diff --git a/evaluations/specs/arc42.yaml b/evaluations/specs/arc42.yaml
new file mode 100644
index 0000000..7532be9
--- /dev/null
+++ b/evaluations/specs/arc42.yaml
@@ -0,0 +1,40 @@
+anchor: arc42
+tier: 3
+
+questions:
+ recognition:
+ question: |
+ Which of the following best describes "arc42"?
+ options:
+ A: A 12-section template for standardized software architecture documentation
+ B: A framework for automated architecture compliance checking in CI pipelines
+ C: A UML-based notation for modeling software components and connectors
+ D: A methodology for evaluating architecture trade-offs against quality attributes
+ correct: A
+
+ application:
+ scenario: |
+ You are starting architecture documentation for a new microservices project.
+ The team needs to understand the system's external interfaces and deployment.
+ Where do you document these two concerns?
+ anchor_prompt: "following arc42"
+ paraphrase_prompt: "Create structured architecture documentation covering all relevant aspects"
+ options:
+ A: Both in a single Architecture Overview chapter
+ B: External interfaces in Section 3 (Context and Scope), deployment in Section 7 (Deployment View)
+ C: External interfaces in the API specification, deployment in the runbook
+ D: Both in Section 5 (Building Block View) at different abstraction levels
+ correct: B
+
+ consistency:
+ variants:
+ - 'Which of the following persons is most closely associated with "arc42"?'
+ - 'Who created the arc42 architecture documentation template?'
+ - 'Which author is known for the arc42 documentation framework?'
+ language_variant: 'Welche Person wird am engsten mit "arc42" assoziiert?'
+ options:
+ A: Martin Fowler
+ B: Gernot Starke
+ C: Robert C. Martin
+ D: Simon Brown
+ correct: B
diff --git a/evaluations/specs/atam.yaml b/evaluations/specs/atam.yaml
new file mode 100644
index 0000000..a04100b
--- /dev/null
+++ b/evaluations/specs/atam.yaml
@@ -0,0 +1,38 @@
+anchor: atam
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "ATAM"?
+
+ '
+ options:
+ A: Systematic documentation framework for software architecture decisions using
+ standardized templates that capture context, decision drivers, and consequences
+ B: Iterative risk-driven methodology for evaluating software architectures through
+ stakeholder workshops that identify risks, sensitivity points, and tradeoffs
+ using scenario-based analysis
+ C: Comprehensive template-based approach for documenting software architecture
+ across twelve standardized sections from context to deployment
+ D: Structured code review process with defined roles and phases that inspects
+ software artifacts against checklists and specifications
+ correct: B
+ application:
+ scenario: Your team is designing a financial trading platform where stakeholders
+ demand sub-100ms response times for trade execution, 99.99% uptime, and bank-level
+ security compliance. The architecture team has proposed using microservices
+ with event sourcing, but concerns have been raised about whether this approach
+ can simultaneously meet all quality requirements.
+ anchor_prompt: using ATAM
+ paraphrase_prompt: What systematic approach would best help evaluate whether the
+ proposed architecture can achieve the conflicting quality requirements?
+ options:
+ A: Conduct load testing on a prototype implementation to measure actual performance
+ metrics and identify bottlenecks before making architectural decisions.
+ B: Build a utility tree to prioritize quality scenarios, then analyze how microservices
+ and event sourcing create tradeoff points between performance, availability,
+ and security.
+ C: Create detailed architecture documentation with UML diagrams and have senior
+ architects review the design against established enterprise patterns.
+ D: Implement proof-of-concept services for critical components and run security
+ penetration tests to validate compliance requirements early.
+ correct: B
diff --git a/evaluations/specs/bdd-given-when-then.yaml b/evaluations/specs/bdd-given-when-then.yaml
new file mode 100644
index 0000000..4dc2139
--- /dev/null
+++ b/evaluations/specs/bdd-given-when-then.yaml
@@ -0,0 +1,44 @@
+anchor: bdd-given-when-then
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "BDD (Behavior-Driven Development)"?
+
+ '
+ options:
+ A: Test-first development approach where unit tests are written before implementation
+ code, focusing on isolated component testing with mock objects to verify interactions
+ between system boundaries
+ B: Structured scenario format — Given a precondition, When an action occurs,
+ Then an expected outcome results; concrete examples as executable specifications
+ that define system behavior
+ C: Agile requirement gathering technique that maps user activities and tasks
+ in chronological order to identify system features and prioritize development
+ based on user journey workflows
+ D: Iterative testing methodology that emphasizes state-based verification through
+ direct assertions on system outputs, promoting comprehensive test coverage
+ without external dependencies or test doubles
+ correct: B
+ application:
+ scenario: Your team is building an e-commerce checkout system where business stakeholders
+ are concerned about complex discount rules and payment validation logic. The
+ product owner, developers, and QA engineers have different interpretations of
+ how promotional codes should work with various payment methods.
+ anchor_prompt: using BDD (Behavior-Driven Development)
+ paraphrase_prompt: to ensure all stakeholders share the same understanding of
+ system behavior and create executable documentation
+ options:
+ A: Write detailed technical specifications first, then have developers implement
+ unit tests that verify the discount calculation algorithms work correctly
+ for each payment method combination.
+ B: Organize discovery workshops with the three amigos to write Given-When-Then
+ scenarios like 'Given a customer has a 20% discount code, When they checkout
+ with a credit card, Then the discount applies before payment processing' that
+ become executable tests.
+ C: Create comprehensive user stories with acceptance criteria, then have QA
+ engineers write end-to-end test scripts that validate the complete checkout
+ workflow from the user interface perspective.
+ D: Develop a prototype of the checkout system quickly, then gather feedback
+ from stakeholders through usability testing sessions to refine the discount
+ and payment features iteratively.
+ correct: B
diff --git a/evaluations/specs/bem-methodology.yaml b/evaluations/specs/bem-methodology.yaml
new file mode 100644
index 0000000..e6a7dfd
--- /dev/null
+++ b/evaluations/specs/bem-methodology.yaml
@@ -0,0 +1,42 @@
+anchor: bem-methodology
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "BEM Methodology"?
+
+ '
+ options:
+ A: A software architecture pattern that separates business logic, event handling,
+ and model validation to create scalable enterprise applications with clear
+ separation of concerns
+ B: CSS naming convention that solves specificity wars, naming conflicts, and
+ stylesheet maintainability issues in large codebases; a block is a standalone
+ component that is meaningful on its own (e.g., `menu`, `button`, `header`)
+ C: A project management methodology that emphasizes iterative development cycles,
+ continuous integration, and stakeholder feedback to deliver software products
+ efficiently
+ D: A database design approach that structures entities, relationships, and metadata
+ to optimize query performance and maintain data integrity across distributed
+ systems
+ correct: B
+ application:
+ scenario: You're developing a navigation component for an e-commerce website that
+ includes a logo, menu items, search functionality, and a shopping cart icon.
+ The navigation needs to support different states like active menu items, disabled
+ search when no products are available, and a compact version for mobile devices.
+ anchor_prompt: using BEM Methodology
+ paraphrase_prompt: structure the CSS class names to ensure maintainability, avoid
+ naming conflicts, and clearly express component relationships
+ options:
+ A: Use semantic class names like `.navigation`, `.logo`, `.menu-link`, `.search-box`,
+ `.cart`, `.active-link`, `.disabled-search`, `.mobile-nav`
+ B: Use structured class names like `.navbar`, `.navbar__logo`, `.navbar__menu-item`,
+ `.navbar__search`, `.navbar__cart`, `.navbar__menu-item--active`, `.navbar__search--disabled`,
+ `.navbar--compact`
+ C: Use hierarchical class names like `.nav`, `.nav .logo`, `.nav .menu .item`,
+ `.nav .search.box`, `.nav .cart.icon`, `.nav .menu .item.active`, `.nav .search.disabled`,
+ `.nav.mobile`
+ D: Use descriptive class names like `.main-navigation`, `.site-logo`, `.primary-menu-link`,
+ `.product-search-input`, `.shopping-cart-button`, `.current-page-link`, `.inactive-search-field`,
+ `.mobile-navigation-bar`
+ correct: B
diff --git a/evaluations/specs/bluf.yaml b/evaluations/specs/bluf.yaml
new file mode 100644
index 0000000..4e20a6a
--- /dev/null
+++ b/evaluations/specs/bluf.yaml
@@ -0,0 +1,38 @@
+anchor: bluf
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "BLUF (Bottom Line Up Front)"?
+
+ '
+ options:
+ A: Begin with background context and build logically toward the final conclusion;
+ comprehensive analysis precedes recommendations
+ B: State the main point, decision, or recommendation immediately; most important
+ information first, supporting details follow
+ C: Present multiple solutions in order of complexity; start with simple approaches
+ before advancing to detailed implementations
+ D: Structure information using inverted pyramid with broad overview first, then
+ progressively narrow to specific technical details
+ correct: B
+ application:
+ scenario: Your team has discovered a critical security vulnerability in the production
+ API that could expose user data. You need to send an urgent email to the CTO
+ and engineering leadership about the issue, its impact, and the proposed fix.
+ anchor_prompt: using BLUF (Bottom Line Up Front)
+ paraphrase_prompt: to communicate the most critical information first for time-pressed
+ executives who need to make immediate decisions
+ options:
+ A: Start with background context about recent security audits, then explain
+ how the vulnerability was discovered, detail the technical analysis process,
+ and conclude with the severity assessment and recommended actions.
+ B: 'Lead with: ''Critical API vulnerability requires immediate hotfix deployment
+ by EOD to prevent potential user data exposure.'' Follow with impact details,
+ technical specifics, and implementation timeline.'
+ C: Begin by outlining the discovery timeline, present a detailed technical analysis
+ of the vulnerability, discuss various potential solutions, then provide your
+ final recommendation and next steps.
+ D: Open with team credentials and recent security improvements, explain the
+ systematic approach used to identify issues, walk through the vulnerability
+ details, and end with proposed solutions.
+ correct: B
diff --git a/evaluations/specs/c4-diagrams.yaml b/evaluations/specs/c4-diagrams.yaml
new file mode 100644
index 0000000..9113143
--- /dev/null
+++ b/evaluations/specs/c4-diagrams.yaml
@@ -0,0 +1,39 @@
+anchor: c4-diagrams
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "C4-Diagrams"?
+
+ '
+ options:
+ A: 'Four components of system design: data, process, interface, and security
+ layers'
+ B: 'Four levels of abstraction; Context: system in its environment (users, external
+ systems)'
+ C: 'Four phases of software development: requirements, design, implementation,
+ and testing'
+ D: 'Four categories of architectural patterns: layered, client-server, pipe-filter,
+ and event-driven'
+ correct: B
+ application:
+ scenario: Your team is building a new e-commerce platform and needs to present
+ the architecture to various stakeholders including executives, developers, and
+ operations staff. The system involves web applications, mobile apps, payment
+ services, inventory databases, and third-party shipping APIs.
+ anchor_prompt: using C4-Diagrams
+ paraphrase_prompt: create a comprehensive architectural documentation strategy
+ that effectively communicates system structure to all stakeholder groups
+ options:
+ A: Create detailed UML class diagrams showing all system interfaces, then add
+ deployment diagrams and sequence diagrams for each major user workflow to
+ ensure complete technical coverage.
+ B: Start with a context diagram showing the system and external actors, then
+ create container diagrams for applications and databases, followed by component
+ diagrams for complex containers as needed.
+ C: Begin with a comprehensive system landscape diagram, then create detailed
+ data flow diagrams, followed by network topology diagrams and API specification
+ documents for each service.
+ D: Design entity-relationship diagrams for all databases first, then create
+ service architecture diagrams, and finish with user journey maps and technical
+ infrastructure blueprints.
+ correct: B
diff --git a/evaluations/specs/chain-of-thought.yaml b/evaluations/specs/chain-of-thought.yaml
new file mode 100644
index 0000000..bdc5550
--- /dev/null
+++ b/evaluations/specs/chain-of-thought.yaml
@@ -0,0 +1,36 @@
+anchor: chain-of-thought
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Chain of Thought (CoT)"?
+
+ '
+ options:
+ A: Connect multiple AI models in sequence where each model's output becomes
+ the input for the next model in the processing pipeline
+ B: Explicitly show intermediate reasoning steps before reaching a conclusion;
+ make the thought process visible, not just the final answer
+ C: Structure prompts using a series of related examples that progressively guide
+ the model toward the desired response pattern
+ D: Break down complex problems into smaller, independent sub-problems that can
+ be solved separately and then combined for the final solution
+ correct: B
+ application:
+ scenario: Your team is debugging a complex data processing pipeline that produces
+ incorrect results for certain edge cases. The LLM-based component seems to jump
+ directly to conclusions without showing its reasoning process. You need to modify
+ the prompting strategy to make the model's decision-making process visible so
+ you can identify where the logic breaks down.
+ anchor_prompt: using Chain of Thought (CoT)
+ paraphrase_prompt: Which prompting approach would best help you understand and
+ debug the model's reasoning process for complex multi-step problems?
+ options:
+ A: Add more examples to the prompt with only the final correct answers, then
+ use temperature=0 for consistent outputs across all test cases.
+ B: Modify prompts to include phrases like 'Let's think step by step' and provide
+ examples that show intermediate reasoning steps before reaching conclusions.
+ C: Increase the context window size and provide comprehensive background documentation
+ about all possible edge cases and their solutions.
+ D: Use ensemble methods by running multiple model instances with different random
+ seeds and selecting the most frequently occurring answer.
+ correct: B
diff --git a/evaluations/specs/clean-architecture.yaml b/evaluations/specs/clean-architecture.yaml
new file mode 100644
index 0000000..42eaf0b
--- /dev/null
+++ b/evaluations/specs/clean-architecture.yaml
@@ -0,0 +1,39 @@
+anchor: clean-architecture
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Clean Architecture"?
+
+ '
+ options:
+ A: Dependencies flow bidirectionally between layers; presentation ↔ business
+ logic ↔ data access ↔ external services
+ B: Dependencies only point inward; entities → use cases → interface adapters
+ → frameworks & drivers
+ C: Code is organized by technical concerns; controllers → services → repositories
+ → database models
+ D: System components are loosely coupled through message passing; publishers
+ → message brokers → subscribers → event handlers
+ correct: B
+ application:
+ scenario: Your team is developing an e-commerce platform that needs to support
+ multiple payment processors (Stripe, PayPal, Square) and may need to switch
+ between them based on business requirements. The payment processing logic contains
+ complex fraud detection rules and transaction validation that must remain consistent
+ regardless of which payment provider is used.
+ anchor_prompt: using Clean Architecture
+ paraphrase_prompt: to ensure the core business logic remains independent of external
+ payment providers while maintaining flexibility to switch between them
+ options:
+ A: Create a shared payment utility class that contains all provider-specific
+ code and business rules, then inject different configuration objects to switch
+ between providers
+ B: Define payment processing use cases in the core layer with abstract interfaces,
+ implement provider-specific adapters in the outer layer, and inject dependencies
+ inward through dependency inversion
+ C: Build separate microservices for each payment provider with a central orchestrator
+ service that routes requests and handles all business logic validation
+ D: Implement a factory pattern that returns different payment processor instances,
+ with each processor containing its own copy of the fraud detection and validation
+ logic
+ correct: B
diff --git a/evaluations/specs/control-chart-shewhart.yaml b/evaluations/specs/control-chart-shewhart.yaml
new file mode 100644
index 0000000..2774073
--- /dev/null
+++ b/evaluations/specs/control-chart-shewhart.yaml
@@ -0,0 +1,32 @@
+anchor: control-chart-shewhart
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Control Chart (Shewhart)"?
+
+ '
+ options:
+ A: Graphical representation of system architecture components and their dependencies
+ B: Measured value plotted over time; center line at the process mean, with control limits
+ C: Visual workflow diagram showing sequential steps in a development process
+ D: Matrix displaying test coverage metrics across different software modules
+ correct: B
+ application:
+ scenario: Your team is monitoring API response times for a critical microservice
+ that handles user authentication. Over the past month, you've collected response
+ time measurements every hour during business hours. The service occasionally
+ experiences unexplained spikes in response time that affect user experience.
+ anchor_prompt: using Control Chart (Shewhart)
+ paraphrase_prompt: to systematically distinguish between normal performance fluctuations
+ and genuine performance issues that require investigation
+ options:
+ A: Set fixed thresholds at 95th and 99th percentiles of historical data, then
+ alert whenever current measurements exceed these static boundaries
+ B: Plot response times over time with a centerline at the process mean and control
+ limits at ±3 standard deviations, then investigate points outside these limits
+ or patterns within the limits
+ C: Use machine learning anomaly detection to automatically identify outliers
+ based on complex multivariate patterns in the time series data
+ D: Calculate rolling averages over 24-hour windows and trigger alerts when the
+ current average deviates more than 20% from the previous day's average
+ correct: B
diff --git a/evaluations/specs/conventional-commits.yaml b/evaluations/specs/conventional-commits.yaml
new file mode 100644
index 0000000..a667196
--- /dev/null
+++ b/evaluations/specs/conventional-commits.yaml
@@ -0,0 +1,44 @@
+anchor: conventional-commits
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Conventional Commits"?
+
+ '
+ options:
+ A: A branching strategy where feature branches follow naming conventions like
+ feature/JIRA-123 with mandatory code review before merging to main
+ B: '<type>[(optional scope)][!]: <description> + optional body/footer; common
+ types include feat, fix, docs, and chore'
+ C: A software architecture pattern that enforces strict separation between business
+ logic and infrastructure through standardized interface contracts
+ D: A deployment methodology that requires all releases to pass through predefined
+ stages with automated gates and rollback capabilities
+ correct: B
+ application:
+ scenario: Your team is preparing to release version 2.1.3 of your API library.
+ During code review, you notice that one developer's pull request removes a deprecated
+ method that some users might still be calling, while another developer's PR
+ adds a new optional parameter to an existing function. The team lead wants all
+ commit messages to clearly indicate how these changes should affect the next
+ version number.
+ anchor_prompt: using Conventional Commits
+ paraphrase_prompt: structure the commit messages to clearly communicate the semantic
+ versioning impact of these changes
+ options:
+ A: 'feat: remove deprecated getUserData method and add timeout parameter to
+ fetchUser function'
+ B: 'feat!: remove deprecated getUserData method
+
+
+ BREAKING CHANGE: getUserData method no longer available
+
+
+ feat: add optional timeout parameter to fetchUser function'
+ C: 'refactor: remove getUserData method
+
+
+ enhancement: add timeout parameter to fetchUser function'
+ D: 'chore: clean up deprecated getUserData method and improve fetchUser function
+ with timeout support'
+ correct: B
diff --git a/evaluations/specs/cqrs.yaml b/evaluations/specs/cqrs.yaml
new file mode 100644
index 0000000..36bd963
--- /dev/null
+++ b/evaluations/specs/cqrs.yaml
@@ -0,0 +1,46 @@
+anchor: cqrs
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "CQRS (Command Query Responsibility
+ Segregation)"?
+
+ '
+ options:
+ A: An architectural pattern that separates business logic from infrastructure
+ concerns by defining ports and adapters, where the core domain remains independent
+ of external systems and frameworks
+ B: Bertrand Meyer's principle — methods either change state (commands) or return
+ data (queries), never both; commands are write operations that change state and
+ return void, and represent intent as immutable command objects
+ C: A design approach where complex business domains are modeled through ubiquitous
+ language, bounded contexts, and aggregate roots to align software structure
+ with business requirements
+ D: A distributed system pattern that ensures data consistency across microservices
+ by coordinating transactions through a central orchestrator that manages compensating
+ actions for failures
+ correct: B
+ application:
+ scenario: Your e-commerce platform handles millions of product searches daily
+ but only thousands of inventory updates per hour. The current unified data model
+ causes performance bottlenecks as complex search queries with filters, sorting,
+ and recommendations compete for database resources with critical inventory management
+ operations.
+ anchor_prompt: using CQRS (Command Query Responsibility Segregation)
+ paraphrase_prompt: How would you architect the system to optimize both the high-volume
+ search operations and the critical inventory updates without them interfering
+ with each other?
+ options:
+ A: Implement database sharding to distribute both search and inventory operations
+ across multiple database instances, using product categories as the sharding
+ key to balance the load evenly.
+ B: 'Create separate optimized data models: a normalized write model for inventory
+ commands and denormalized read models for search queries, synchronized through
+ domain events with eventual consistency.'
+ C: Use a master-slave database replication setup where all inventory updates
+ go to the master and search queries are distributed across multiple read replicas
+ to reduce contention.
+ D: Implement a caching layer with Redis to store frequently accessed product
+ data and search results, reducing database load while maintaining a single
+ unified data model.
+ correct: B
diff --git a/evaluations/specs/cynefin-framework.yaml b/evaluations/specs/cynefin-framework.yaml
new file mode 100644
index 0000000..6bccb1a
--- /dev/null
+++ b/evaluations/specs/cynefin-framework.yaml
@@ -0,0 +1,39 @@
+anchor: cynefin-framework
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Cynefin Framework"?
+
+ '
+ options:
+ A: Strategic mapping technique that visualizes value chains and component evolution
+ stages over time
+ B: 'Five domains; Clear: best practices apply, sense-categorize-respond'
+ C: 'Agile methodology framework with four iterative phases: plan-do-check-act
+ for continuous improvement'
+ D: 'Decision-making model using three assessment layers: context-analysis-action
+ for complex problems'
+ correct: B
+ application:
+ scenario: 'Your team is facing three different challenges: a well-understood database
+ migration that follows established procedures, intermittent performance issues
+ that require expert analysis, and completely unpredictable user behavior patterns
+ in a new AI feature. The team is debating how to approach each problem and allocate
+ resources effectively.'
+ anchor_prompt: using Cynefin Framework
+ paraphrase_prompt: categorize these challenges by their complexity characteristics
+ to determine the most appropriate decision-making approach for each
+ options:
+ A: Treat all three as technical problems requiring expert analysis, form specialized
+ teams for each, and conduct thorough requirements gathering before taking
+ action on any of them.
+ B: Apply best practices to the database migration, assign experts to analyze
+ the performance issues, and run small experiments to understand the AI feature
+ behavior patterns.
+ C: Prioritize all three challenges by business impact, assign the most experienced
+ developers to each, and create detailed project plans with fixed timelines
+ for resolution.
+ D: Escalate all three issues to senior architects for decision-making, document
+ comprehensive risk assessments, and implement the solutions with the highest
+ confidence levels first.
+ correct: B
diff --git a/evaluations/specs/definition-of-done.yaml b/evaluations/specs/definition-of-done.yaml
new file mode 100644
index 0000000..bd1b96c
--- /dev/null
+++ b/evaluations/specs/definition-of-done.yaml
@@ -0,0 +1,42 @@
+anchor: definition-of-done
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Definition of Done"?
+
+ '
+ options:
+ A: A comprehensive project timeline that outlines all deliverables, milestones,
+ and dependencies from project initiation through final deployment; includes
+ resource allocation, risk assessments, and stakeholder sign-off requirements
+ B: A formal, team-wide checklist of quality criteria that every increment must
+ satisfy before it is declared "done"; concrete, verifiable conditions — e.g.,
+ code reviewed, tests passing, documentation updated, no known defects
+ C: A structured template for capturing user requirements in the format 'As a
+ [user type], I want [functionality] so that [business value]'; includes acceptance
+ criteria, story points, and priority rankings for backlog management
+ D: A prioritization framework that categorizes requirements into Must have,
+ Should have, Could have, and Won't have categories; helps teams focus on essential
+ features while managing scope and stakeholder expectations effectively
+ correct: B
+ application:
+ scenario: Your agile team has been experiencing issues with features being marked
+ as complete during sprints, only to discover missing documentation, failing
+ edge case tests, or incomplete code reviews during the final sprint review.
+ The Product Owner is frustrated because features appear done in daily standups
+ but aren't actually ready for release.
+ anchor_prompt: using Definition of Done
+ paraphrase_prompt: establish a shared understanding of what constitutes truly
+ completed work to prevent late-cycle surprises and ensure consistent quality
+ standards
+ options:
+ A: Create individual checklists for each team member based on their role and
+ expertise, allowing developers to focus on code while QA handles testing criteria
+ B: Collaboratively create a single, team-wide checklist of quality criteria
+ that every increment must satisfy before being declared complete, including
+ code review, tests passing, and documentation updated
+ C: Have the Product Owner define completion criteria for each user story individually
+ based on business value and customer requirements
+ D: Implement a post-sprint quality gate where a designated team lead reviews
+ all completed work and decides what meets release standards
+ correct: B
diff --git a/evaluations/specs/devils-advocate.yaml b/evaluations/specs/devils-advocate.yaml
new file mode 100644
index 0000000..15397f4
--- /dev/null
+++ b/evaluations/specs/devils-advocate.yaml
@@ -0,0 +1,35 @@
+anchor: devils-advocate
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Devil''s Advocate"?
+
+ '
+ options:
+ A: Systematically identify potential failure points and weaknesses in a system
+ design before implementation begins
+ B: Present opposing viewpoints even if not personally held; question premises
+ and surface hidden assumptions
+ C: Challenge team members' technical decisions through aggressive questioning
+ to test their knowledge and expertise
+ D: Assign responsibility for identifying risks and negative outcomes to a designated
+ team member during planning sessions
+ correct: B
+ application:
+ scenario: Your team has designed a new microservices architecture that will replace
+ the current monolithic system. The design has been well-received by stakeholders
+ and addresses all known requirements. Before finalizing the architecture decision,
+ you want to ensure you haven't overlooked critical issues.
+ anchor_prompt: using Devil's Advocate
+ paraphrase_prompt: What approach should you take to identify potential weaknesses
+ in your architecture design before implementation?
+ options:
+ A: Conduct a final walkthrough with stakeholders to confirm the design meets
+ all their stated requirements and get formal sign-off
+ B: Systematically argue against your own design by presenting the strongest
+ possible case for why this architecture could fail or cause problems
+ C: Create detailed implementation timelines and resource estimates to validate
+ the feasibility of the proposed architecture
+ D: Research similar architectures used by other companies to benchmark your
+ design against industry best practices
+ correct: B
diff --git a/evaluations/specs/diataxis-framework.yaml b/evaluations/specs/diataxis-framework.yaml
new file mode 100644
index 0000000..e9fe327
--- /dev/null
+++ b/evaluations/specs/diataxis-framework.yaml
@@ -0,0 +1,37 @@
+anchor: diataxis-framework
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Diátaxis Framework"?
+
+ '
+ options:
+ A: Agile methodology framework focusing on iterative development cycles with
+ continuous stakeholder feedback loops
+ B: 'Four documentation types: tutorials (learning-oriented), how-to guides (task-oriented), reference (information-oriented), and explanation (understanding-oriented)'
+ C: Software architecture pattern that separates presentation, business logic,
+ and data access into distinct layers
+ D: Project management approach emphasizing cross-functional teams and rapid
+ prototyping for complex systems
+ correct: B
+ application:
+ scenario: Your team has developed a new API authentication library and needs to
+ create comprehensive documentation. Users are complaining that they can't find
+ what they need - some want to learn the basics, others need quick solutions
+ to specific problems, and developers need detailed technical specifications.
+ anchor_prompt: using Diátaxis Framework
+ paraphrase_prompt: organize the documentation to systematically address different
+ user needs and purposes
+ options:
+ A: Create a single comprehensive guide that covers everything from basic concepts
+ to advanced implementation details, organized by feature complexity from simple
+ to advanced use cases.
+ B: 'Develop four distinct documentation sections: beginner tutorials for learning,
+ task-specific how-to guides, complete API reference materials, and conceptual
+ explanations of authentication principles.'
+ C: Structure documentation around user personas, creating separate sections
+ for frontend developers, backend developers, security engineers, and project
+ managers with role-specific information.
+ D: 'Organize content chronologically following the typical development workflow:
+ planning, setup, implementation, testing, deployment, and maintenance phases.'
+ correct: B
diff --git a/evaluations/specs/docs-as-code.yaml b/evaluations/specs/docs-as-code.yaml
new file mode 100644
index 0000000..1c7c807
--- /dev/null
+++ b/evaluations/specs/docs-as-code.yaml
@@ -0,0 +1,38 @@
+anchor: docs-as-code
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Docs-as-Code according to Ralf
+ D. Müller"?
+
+ '
+ options:
+ A: Writing documentation in a wiki with WYSIWYG editing and real-time collaboration
+ B: 'Treating documentation like source code: version-controlled, peer-reviewed,
+ and built automatically'
+ C: Generating API documentation automatically from code annotations and docstrings
+ D: Maintaining a separate documentation repository with its own release cycle
+ correct: B
+ application:
+ scenario: Your development team maintains a microservices platform with complex
+ API documentation that frequently becomes outdated when code changes. The team
+ uses Git for version control and has automated CI/CD pipelines. Management wants
+ documentation that stays synchronized with code changes and can be generated
+ in multiple formats for different stakeholders.
+ anchor_prompt: using Docs-as-Code according to Ralf D. Müller
+ paraphrase_prompt: How should you structure and manage the documentation workflow
+ to ensure it remains current and accessible?
+ options:
+ A: Create a centralized wiki system with automated API extraction, assign documentation
+ ownership to technical writers, and schedule weekly documentation reviews
+ to ensure accuracy across all services.
+ B: Write documentation in AsciiDoc format stored in Git repositories alongside
+ code, implement docToolchain with Gradle automation, use PlantUML for diagrams,
+ and require documentation updates in every pull request.
+ C: Implement a headless CMS with version control integration, create documentation
+ templates in Microsoft Word, and establish a quarterly documentation sprint
+ cycle with stakeholder review sessions.
+ D: Set up Confluence spaces linked to JIRA tickets, use embedded Lucidchart
+ diagrams, create documentation branches that merge after code releases, and
+ maintain separate review cycles for docs and code.
+ correct: B
diff --git a/evaluations/specs/domain-driven-design.yaml b/evaluations/specs/domain-driven-design.yaml
new file mode 100644
index 0000000..e50d907
--- /dev/null
+++ b/evaluations/specs/domain-driven-design.yaml
@@ -0,0 +1,41 @@
+anchor: domain-driven-design
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Domain-Driven Design according
+ to Evans"?
+
+ '
+ options:
+ A: Architectural pattern that separates business logic into distinct layers
+ with clear interfaces between presentation, application, and data access components
+ B: Shared vocabulary between developers and domain experts; explicit boundaries
+ where a model is defined and applicable
+ C: Software development methodology that emphasizes iterative delivery of working
+ software through close collaboration between cross-functional teams and stakeholders
+ D: Design approach that focuses on creating reusable software components with
+ well-defined interfaces that can be composed into larger systems
+ correct: B
+ application:
+ scenario: Your team is building a complex insurance claims processing system where
+ business rules frequently change and involve multiple departments (underwriting,
+ claims adjustment, fraud detection, customer service). The business stakeholders
+ use terms like 'policy holder,' 'coverage limits,' and 'claim settlement' but
+ developers are implementing these concepts inconsistently across different parts
+ of the system.
+ anchor_prompt: using Domain-Driven Design according to Evans
+ paraphrase_prompt: to ensure consistent understanding and implementation of business
+ concepts across the development team and stakeholders
+ options:
+ A: Create comprehensive technical documentation that maps business terms to
+ database schemas and API endpoints, then train all developers on the correct
+ technical implementations
+ B: Establish a ubiquitous language by working closely with domain experts to
+ define shared vocabulary, then ensure this language is consistently used in
+ code, conversations, and models
+ C: Implement a centralized data dictionary service that validates all business
+ term usage across microservices and enforces standardized naming conventions
+ D: Organize regular cross-functional meetings where business stakeholders explain
+ requirements to developers using standardized requirement templates and acceptance
+ criteria
+ correct: B
diff --git a/evaluations/specs/ears-requirements.yaml b/evaluations/specs/ears-requirements.yaml
new file mode 100644
index 0000000..a87c03c
--- /dev/null
+++ b/evaluations/specs/ears-requirements.yaml
@@ -0,0 +1,40 @@
+anchor: ears-requirements
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "EARS-Requirements"?
+
+ '
+ options:
+ A: A systematic approach for evaluating and analyzing requirements through stakeholder
+ interviews and documentation review
+ B: '"The <system> shall <response>"; "When <trigger>, the <system> shall <response>"'
+ C: 'A framework for organizing requirements into hierarchical categories: Essential,
+ Auxiliary, Regulatory, and Supplementary'
+ D: A validation methodology that ensures requirements are Explicit, Achievable,
+ Relevant, and Specific before implementation
+ correct: B
+ application:
+ scenario: You are documenting requirements for a medical device monitoring system
+ that tracks patient vital signs. The system must handle various operational
+ states, emergency conditions, and optional features like wireless connectivity.
+ Your team needs clear, testable requirements that will support regulatory approval
+ and system verification.
+ anchor_prompt: using EARS-Requirements
+ paraphrase_prompt: structure these requirements using a systematic template-based
+ approach that ensures clarity and testability
+ options:
+ A: Write detailed user stories with acceptance criteria, organize them by epic
+ and priority, and include definition of done for each story to ensure the
+ development team understands the business value.
+ B: 'Structure requirements using specific templates: ''The system shall...''
+ for basic functions, ''WHEN alarm triggered the system shall...'' for events,
+ ''WHILE monitoring the system shall...'' for states, and ''IF battery low
+ THEN the system shall...'' for conditions.'
+ C: Create a comprehensive requirements specification document with functional
+ and non-functional sections, include use case diagrams, and establish a requirements
+ traceability matrix linking to test cases.
+ D: Define requirements as measurable objectives with key performance indicators,
+ establish SMART criteria for each requirement, and create a validation framework
+ with quantitative success metrics.
+ correct: B
diff --git a/evaluations/specs/event-driven-architecture.yaml b/evaluations/specs/event-driven-architecture.yaml
new file mode 100644
index 0000000..81ea3a9
--- /dev/null
+++ b/evaluations/specs/event-driven-architecture.yaml
@@ -0,0 +1,44 @@
+anchor: event-driven-architecture
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Event-Driven Architecture"?
+
+ '
+ options:
+ A: Components are organized around business domains with clear boundaries, where
+ each domain contains its own models, services, and data stores that reflect
+ real-world business concepts
+ B: Components communicate by emitting and reacting to events rather than direct
+ calls; producers publish events without knowing which consumers will process
+ them
+ C: Components are structured in concentric layers with business logic at the
+ center, isolated from external concerns through dependency inversion and interface
+ abstractions
+ D: Components communicate through well-defined interfaces at the boundaries
+ while keeping core business logic independent of external frameworks, databases,
+ and user interfaces
+ correct: B
+ application:
+ scenario: Your e-commerce platform needs to handle order processing, inventory
+ updates, payment processing, and shipping notifications. Currently, the order
+ service directly calls the inventory service, payment service, and shipping
+ service synchronously, causing delays and tight coupling between services.
+ anchor_prompt: using Event-Driven Architecture
+ paraphrase_prompt: How would you redesign this system to reduce coupling between
+ services and improve scalability while ensuring all necessary business processes
+ still execute when orders are placed?
+ options:
+ A: Create a centralized order orchestrator service that manages the workflow
+ by making sequential API calls to inventory, payment, and shipping services
+ with retry logic and circuit breakers.
+ B: Have the order service publish an 'OrderPlaced' event to a message queue,
+ with inventory, payment, and shipping services subscribing to process their
+ respective tasks independently and asynchronously.
+ C: Implement a shared database that all services can read from and write to,
+ with database triggers that automatically update related tables when orders
+ are inserted.
+ D: Use a microservices gateway that routes requests to the appropriate services
+ and aggregates responses, with caching layers to improve performance between
+ service calls.
+ correct: B
diff --git a/evaluations/specs/fagan-inspection.yaml b/evaluations/specs/fagan-inspection.yaml
new file mode 100644
index 0000000..9dc9b40
--- /dev/null
+++ b/evaluations/specs/fagan-inspection.yaml
@@ -0,0 +1,42 @@
+anchor: fagan-inspection
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Fagan Inspection"?
+
+ '
+ options:
+ A: A systematic code mutation technique where defects are artificially introduced
+ into software modules to evaluate the effectiveness of existing test suites
+ and identify gaps in test coverage.
+ B: A structured, multi-phase review process for software artifacts (requirements,
+ design, code) with defined roles and entry/exit criteria; moderator (facilitates
+ and logs), author (created the artifact), inspectors (reviewers), recorder
+ (documents defects)
+ C: A formal verification methodology that uses mathematical proofs and static
+ analysis to demonstrate software correctness without executing the program,
+ focusing on pre-conditions and post-conditions.
+ D: A risk-based assessment framework for evaluating software architecture decisions
+ through scenario-based analysis, stakeholder interviews, and systematic documentation
+ of trade-offs and quality attributes.
+ correct: B
+ application:
+ scenario: Your team is developing flight control software for a commercial aircraft.
+ The requirements document has been completed and needs to be reviewed before
+ the design phase begins. Several team members have expressed concerns about
+ potential ambiguities and missing edge cases in the requirements.
+ anchor_prompt: using Fagan Inspection
+ paraphrase_prompt: What structured approach should you take to systematically
+ review the requirements document with your team?
+ options:
+ A: Schedule a team meeting where everyone reads through the requirements together
+ and discusses any issues they notice during the session.
+ B: Assign specific roles including a moderator and recorder, have each inspector
+ study the requirements individually beforehand, then hold a formal meeting
+ to identify and classify defects systematically.
+ C: Distribute the requirements to all team members via email and ask them to
+ send back their comments within a week, then compile all feedback into a single
+ document.
+ D: Have the requirements author present the document to the team in a walkthrough
+ session where attendees can ask questions and suggest improvements in real-time.
+ correct: B
diff --git a/evaluations/specs/feynman-technique.yaml b/evaluations/specs/feynman-technique.yaml
new file mode 100644
index 0000000..7e2eb30
--- /dev/null
+++ b/evaluations/specs/feynman-technique.yaml
@@ -0,0 +1,38 @@
+anchor: feynman-technique
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Feynman Technique"?
+
+ '
+ options:
+ A: Break down complex problems into smaller, manageable components by creating
+ visual diagrams that map dependencies and identify potential bottlenecks in
+ the system
+ B: Teach the concept in simple language as if to a beginner (traditionally "explain
+ to a 12-year-old"); when you struggle to explain, you've found gaps in your
+ understanding
+ C: Use rapid prototyping and iterative feedback loops to validate assumptions
+ early in the development process before committing to full implementation
+ D: Apply the principle of progressive disclosure by revealing information gradually
+ to users, starting with the most essential features and adding complexity
+ as needed
+ correct: B
+ application:
+ scenario: Sarah is a senior developer who needs to understand a complex distributed
+ caching system before implementing a critical feature. She's read the documentation
+ and architecture diagrams, but feels uncertain about key concepts like cache
+ coherency protocols and distributed consensus mechanisms.
+ anchor_prompt: using Feynman Technique
+ paraphrase_prompt: to identify and fill knowledge gaps about the caching system
+ options:
+ A: Create detailed technical diagrams mapping all system components and their
+ interactions, then review them with the architecture team to ensure accuracy.
+ B: Write a simple explanation of how the caching system works as if teaching
+ it to a junior developer, noting where she struggles to explain clearly, then
+ study those areas more deeply.
+ C: Build a small prototype implementation to test her assumptions about the
+ system behavior and identify any gaps through hands-on experimentation.
+ D: Schedule meetings with the original system architects to ask detailed questions
+ about implementation decisions and document their responses.
+ correct: B
diff --git a/evaluations/specs/five-whys.yaml b/evaluations/specs/five-whys.yaml
new file mode 100644
index 0000000..a246900
--- /dev/null
+++ b/evaluations/specs/five-whys.yaml
@@ -0,0 +1,40 @@
+anchor: five-whys
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Five Whys (Ohno)"?
+
+ '
+ options:
+ A: Create five alternative solution paths and systematically evaluate each option
+ against predefined criteria to select the optimal approach
+ B: Ask "Why?" repeatedly (typically ~5 times) to drill down to root causes;
+ distinguish between surface symptoms and underlying causes
+ C: Break down complex problems into exactly five manageable components and address
+ each component using dedicated team resources
+ D: Conduct five rounds of stakeholder interviews to gather comprehensive requirements
+ and validate assumptions before implementation
+ correct: B
+ application:
+ scenario: Your team's automated deployment pipeline has failed three times this
+ week, each time requiring manual intervention to complete the release. The immediate
+ cause appears to be intermittent network timeouts during the artifact upload
+ phase, but previous attempts to increase timeout values haven't resolved the
+ underlying issue.
+ anchor_prompt: using Five Whys (Ohno)
+ paraphrase_prompt: to systematically drill down from surface symptoms to identify
+ the actionable root cause of these recurring deployment failures
+ options:
+ A: Document all three failure instances, categorize the types of network errors,
+ and create a comprehensive troubleshooting runbook for future occurrences
+ of similar timeout issues.
+ B: Ask why network timeouts occur, then why that cause exists, continuing this
+ questioning process until you reach an underlying cause that the team can
+ take concrete action to prevent.
+ C: Gather the development, infrastructure, and network teams to brainstorm all
+ possible factors contributing to deployment failures and create a fishbone
+ diagram mapping relationships between causes.
+ D: Implement monitoring dashboards to track network latency patterns, set up
+ automated alerts for timeout thresholds, and establish escalation procedures
+ for deployment pipeline failures.
+ correct: B
diff --git a/evaluations/specs/fowler-patterns.yaml b/evaluations/specs/fowler-patterns.yaml
new file mode 100644
index 0000000..50e4b29
--- /dev/null
+++ b/evaluations/specs/fowler-patterns.yaml
@@ -0,0 +1,43 @@
+anchor: fowler-patterns
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Patterns of Enterprise Application
+ Architecture (PEAA)"?
+
+ '
+ options:
+ A: A comprehensive framework defining bounded contexts, aggregates, entities,
+ value objects, repositories, and domain services for implementing complex
+ business logic in enterprise systems
+ B: Transaction Script, Domain Model, Table Module, Service Layer; table data
+ gateway, row data gateway, active record, data mapper
+ C: An architectural approach emphasizing ports and adapters, dependency inversion,
+ use cases, interactors, and clean separation between business rules and external
+ frameworks or databases
+ D: A layered architecture pattern consisting of presentation layer, business
+ logic layer, data access layer, and cross-cutting concerns like logging, security,
+ and transaction management
+ correct: B
+ application:
+ scenario: Your team is building an e-commerce platform where customer orders involve
+ complex business rules like discount calculations, inventory checks, and shipping
+ validations. The application needs to handle high transaction volumes while
+ maintaining data consistency across multiple database tables.
+ anchor_prompt: using Patterns of Enterprise Application Architecture (PEAA)
+ paraphrase_prompt: What architectural approach would best organize the business
+ logic and data access for this complex transactional system?
+ options:
+ A: Implement a microservices architecture with each service handling a single
+ business capability, using REST APIs for communication and eventual consistency
+ for data synchronization.
+ B: Use a Domain Model pattern for complex business logic with a Data Mapper
+ pattern for persistence, complemented by a Unit of Work pattern to manage
+ transactions across multiple entities.
+ C: Create a single monolithic service with stored procedures handling all business
+ logic in the database layer, using direct SQL calls from the presentation
+ tier.
+ D: Build a reactive event-driven system using CQRS with separate read and write
+ models, implementing event sourcing to capture all state changes as immutable
+ events.
+ correct: B
diff --git a/evaluations/specs/gherkin.yaml b/evaluations/specs/gherkin.yaml
new file mode 100644
index 0000000..171f997
--- /dev/null
+++ b/evaluations/specs/gherkin.yaml
@@ -0,0 +1,40 @@
+anchor: gherkin
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Gherkin"?
+
+ '
+ options:
+ A: A domain-specific language for writing human-readable executable specifications
+ using Given/When/Then keywords in behavior-driven development
+ B: A software testing methodology that emphasizes writing failing tests first,
+ then implementing the minimum code to make them pass
+ C: A requirements elicitation technique using targeted questions to uncover
+ assumptions and clarify stakeholder needs
+ D: A structured format for documenting acceptance criteria using natural language
+ templates with numbered steps and expected outcomes
+ correct: A
+ application:
+ scenario: Your team is developing an e-commerce checkout system where business
+ analysts need to specify payment validation rules that developers can implement
+ and testers can verify. The product owner wants to ensure that both successful
+ payments and various error conditions are properly handled, and all stakeholders
+ need to understand the expected behavior.
+ anchor_prompt: using Gherkin
+ paraphrase_prompt: structure the payment validation specifications so they can
+ serve as both human-readable documentation and automated test cases
+ options:
+ A: Write detailed technical specifications in Confluence with UML diagrams showing
+ payment flow states, then create separate unit tests with mock payment gateways
+ to verify each validation rule independently.
+ B: Create feature files with scenarios that use Given-When-Then steps to describe
+ payment contexts, user actions, and expected outcomes, organizing related
+ scenarios under payment validation features with shared background steps.
+ C: Document payment rules in user story format with acceptance criteria bullets,
+ then implement integration tests that call actual payment APIs to validate
+ the complete payment processing workflow.
+ D: Build a requirements traceability matrix linking business rules to test cases,
+ then write automated UI tests that simulate user interactions with the checkout
+ form to verify payment validation behavior.
+ correct: B
diff --git a/evaluations/specs/github-flow.yaml b/evaluations/specs/github-flow.yaml
new file mode 100644
index 0000000..9560f78
--- /dev/null
+++ b/evaluations/specs/github-flow.yaml
@@ -0,0 +1,37 @@
+anchor: github-flow
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "GitHub Flow"?
+
+ '
+ options:
+ A: A semantic versioning system that automatically increments version numbers
+ based on commit message patterns and release types.
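+ B: 'A lightweight, branch-based workflow: create a branch from main, commit changes, open a pull request, review, then merge and deploy.'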
+ C: A standardized format for writing commit messages that includes type, scope,
+ and description to improve project history readability.
+ D: A distributed version control architecture that enables multiple developers
+ to work on separate repositories while maintaining code synchronization.
+ correct: B
+ application:
+ scenario: Your team is working on a web application that gets deployed to production
+ multiple times per day. A critical bug has been reported by users, and you need
+ to implement a hotfix while ensuring the main branch remains stable and deployable.
+ The team follows a branch-based workflow where every change goes through code
+ review.
+ anchor_prompt: using GitHub Flow
+ paraphrase_prompt: What approach should you take to implement and deploy this
+ hotfix while maintaining continuous delivery practices?
+ options:
+ A: Create a hotfix branch from the latest release tag, implement the fix, merge
+ it back to both the release branch and main, then deploy from the release
+ branch
+ B: Create a hotfix branch from main, implement the fix, open a pull request
+ for code review, merge to main after approval, then deploy immediately from
+ main
+ C: Implement the fix directly on main branch, commit the changes, run tests
+ locally, then push and deploy if tests pass
+ D: Create a hotfix branch from main, implement the fix, merge it to a staging
+ branch for testing, then merge from staging to main after validation
+ correct: B
diff --git a/evaluations/specs/gutes-deutsch-wolf-schneider.yaml b/evaluations/specs/gutes-deutsch-wolf-schneider.yaml
new file mode 100644
index 0000000..c6e4387
--- /dev/null
+++ b/evaluations/specs/gutes-deutsch-wolf-schneider.yaml
@@ -0,0 +1,44 @@
+anchor: gutes-deutsch-wolf-schneider
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Gutes Deutsch nach Wolf Schneider"?
+
+ '
+ options:
+ A: Structure information hierarchically with the most important conclusion first,
+ followed by supporting arguments grouped logically — each section should build
+ upon the previous one; use consistent formatting throughout.
+ B: Prefer short, direct sentences over long, complex ones — every sentence should
+ express one idea; use active constructions; avoid passive voice and impersonal
+ constructions wherever possible
+ C: Begin with the bottom line up front, presenting key findings immediately
+ — organize supporting details in order of decreasing importance; eliminate
+ unnecessary background information and focus on actionable insights.
+ D: Apply modular design principles where each component serves a single responsibility
+ — minimize dependencies between modules; favor composition over inheritance
+ and maintain loose coupling throughout the system architecture.
+ correct: B
+ application:
+ scenario: A German software company is revising their user documentation after
+ receiving complaints that it's difficult to understand. The current version
+ contains many long sentences with multiple clauses, passive constructions, and
+ abstract technical jargon. The technical writing team needs to rewrite a key
+ section explaining how users can configure system settings.
+ anchor_prompt: using Gutes Deutsch nach Wolf Schneider
+ paraphrase_prompt: to create clear, accessible German prose that prioritizes reader
+ comprehension and eliminates unnecessary complexity
+ options:
+ A: Use sophisticated vocabulary and complex sentence structures to demonstrate
+ technical expertise, incorporate industry-standard terminology throughout,
+ and maintain formal passive voice constructions to convey professional authority.
+ B: Write short, direct sentences with active voice, replace abstract noun phrases
+ with concrete verbs, eliminate filler words, and choose familiar terms over
+ technical jargon when both convey the same meaning.
+ C: Focus on comprehensive coverage by including detailed explanations for every
+ possible scenario, use subordinate clauses to show relationships between concepts,
+ and employ precise technical terminology for accuracy.
+ D: Structure content using bullet points and numbered lists exclusively, maintain
+ consistent sentence length throughout the document, and include extensive
+ cross-references to related technical specifications and standards.
+ correct: B
diff --git a/evaluations/specs/hexagonal-architecture.yaml b/evaluations/specs/hexagonal-architecture.yaml
new file mode 100644
index 0000000..085e3fb
--- /dev/null
+++ b/evaluations/specs/hexagonal-architecture.yaml
@@ -0,0 +1,41 @@
+anchor: hexagonal-architecture
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Hexagonal Architecture (Ports
+ & Adapters)"?
+
+ '
+ options:
+ A: Six-layered application structure where each layer handles specific responsibilities
+ and communicates only with adjacent layers
+ B: Core domain at the center, isolated from external concerns; interfaces defining
+ how the application communicates
+ C: Database design pattern using six normalized tables with adapter classes
+ to handle object-relational mapping between entities
+ D: Microservices architecture pattern where six independent services communicate
+ through standardized API ports and message adapters
+ correct: B
+ application:
+ scenario: Your team is building a payment processing service that needs to support
+ multiple payment gateways (Stripe, PayPal, Square), handle requests from both
+ a web API and a mobile SDK, store transaction data in PostgreSQL, and send notifications
+ via email and SMS. The business requirements are complex but well-defined, while
+ the specific technologies may change over time.
+ anchor_prompt: using Hexagonal Architecture (Ports & Adapters)
+ paraphrase_prompt: How would you structure this system to maximize testability,
+ technology independence, and the ability to easily swap external integrations?
+ options:
+ A: Create a layered architecture with separate presentation, business logic,
+ and data access layers, using dependency injection to manage connections between
+ payment gateways and notification services.
+ B: Place payment processing domain logic at the center, define port interfaces
+ for payment gateways and notifications, then implement adapters for each external
+ service, ensuring all dependencies point inward to the core domain.
+ C: Build a microservices architecture with separate services for each payment
+ gateway, a central orchestrator service, and shared databases to maintain
+ consistency across all payment operations.
+ D: Implement a plugin-based architecture where each payment gateway and notification
+ method is a plugin, with a central registry managing plugin lifecycle and
+ a shared event bus for communication.
+ correct: B
diff --git a/evaluations/specs/iec-61508-sil-levels.yaml b/evaluations/specs/iec-61508-sil-levels.yaml
new file mode 100644
index 0000000..72bd587
--- /dev/null
+++ b/evaluations/specs/iec-61508-sil-levels.yaml
@@ -0,0 +1,35 @@
+anchor: iec-61508-sil-levels
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "IEC 61508 SIL Levels"?
+
+ '
+ options:
+ A: 'Three Quality Assurance Levels; QAL 1: 10^-3 ≤ PFD < 10^-2 (acceptable defect
+ density)'
+ B: 'Four Safety Integrity Levels; SIL 1: 10^-2 ≤ PFD < 10^-1 (tolerable risk reduction)'
+ C: 'Five Reliability Assessment Levels; RAL 1: 10^-4 ≤ PFD < 10^-3 (minimum performance
+ threshold)'
+ D: 'Six Verification Testing Levels; VTL 1: 10^-1 ≤ PFD < 10^0 (standard compliance
+ range)'
+ correct: B
+ application:
+ scenario: Your team is developing a safety instrumented system for a chemical
+ processing plant that must prevent overpressure conditions. The hazard analysis
+ indicates that failure of this safety function could result in equipment damage
+ and potential worker injury, with a tolerable risk requiring the safety system
+ to have a probability of failure on demand between 10^-3 and 10^-2.
+ anchor_prompt: using IEC 61508 SIL Levels
+ paraphrase_prompt: determine the appropriate safety integrity classification and
+ corresponding development requirements for this safety function
+ options:
+ A: Classify as SIL 1, implement basic software development practices with minimal
+ verification requirements and simple hardware architecture constraints
+ B: Classify as SIL 2, implement structured software development methods with
+ moderate verification requirements and hardware fault tolerance measures
+ C: Classify as SIL 3, implement rigorous software development processes with
+ extensive verification and validation plus high hardware fault tolerance
+ D: Classify as SIL 4, implement the most stringent development processes with
+ maximum verification requirements and highest level hardware redundancy
+ correct: B
diff --git a/evaluations/specs/impact-mapping.yaml b/evaluations/specs/impact-mapping.yaml
new file mode 100644
index 0000000..046474e
--- /dev/null
+++ b/evaluations/specs/impact-mapping.yaml
@@ -0,0 +1,39 @@
+anchor: impact-mapping
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Impact Mapping"?
+
+ '
+ options:
+ A: User → Stories → Epics → Features; requirement decomposition methodology
+ focusing on user needs and system capabilities
+ B: Goal → Actors → Impacts → Deliverables; links a business objective (why?) to who can influence it, how their behavior should change, and what to deliver
+ C: Problem → Analysis → Design → Implementation; systematic approach to software
+ development through structured phase transitions
+ D: Stakeholder → Requirements → Architecture → Code; traceability framework
+ linking business needs to technical implementation details
+ correct: B
+ application:
+ scenario: Your e-commerce platform team has been asked to increase customer retention
+ by 15% over the next six months. The stakeholders have different opinions on
+ what features to build, with marketing wanting loyalty programs, engineering
+ suggesting performance improvements, and customer service pushing for better
+ support tools.
+ anchor_prompt: using Impact Mapping
+ paraphrase_prompt: to create a goal-oriented plan that connects business outcomes
+ to specific deliverables while identifying key stakeholders
+ options:
+ A: Create a feature prioritization matrix ranking loyalty programs, performance
+ improvements, and support tools based on development effort and expected customer
+ impact scores.
+ B: Map the retention goal to key actors (existing customers, support agents,
+ marketing team), identify how their behaviors need to change, then determine
+ what deliverables enable those behavior changes.
+ C: Conduct user story mapping sessions with all stakeholders to create a shared
+ product backlog organized by customer journey stages and prioritized by business
+ value.
+ D: Develop a roadmap showing three parallel workstreams for loyalty features,
+ performance optimization, and support enhancements with clear milestones and
+ dependencies.
+ correct: B
diff --git a/evaluations/specs/invest.yaml b/evaluations/specs/invest.yaml
new file mode 100644
index 0000000..8aea817
--- /dev/null
+++ b/evaluations/specs/invest.yaml
@@ -0,0 +1,45 @@
+anchor: invest
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "INVEST"?
+
+ '
+ options:
+ A: Stories should be prioritized using Must have, Should have, Could have, and
+ Won't have categories to ensure critical requirements are delivered first;
+ stakeholders rank features by business value and implementation complexity
+ to guide sprint planning and resource allocation.
+ B: Stories should be self-contained and deliverable in any order; avoid dependencies
+ between stories that force a fixed implementation sequence; stories are not
+ contracts; the details are open to discussion between team and stakeholders
+ until they enter a sprint
+ C: Stories should be mapped chronologically along a user journey timeline with
+ supporting tasks underneath; teams visualize the complete user experience
+ to identify gaps, prioritize releases, and maintain focus on delivering end-to-end
+ value through iterative development cycles.
+ D: Stories should follow a standardized template with acceptance criteria, definition
+ of done, and effort estimates; teams must complete detailed analysis and obtain
+ formal approval from product owners before any story can be moved into development
+ or testing phases.
+ correct: B
+ application:
+ scenario: 'Your team is reviewing user stories during backlog refinement for an
+ e-commerce platform. One story reads: ''As a customer, I want the system to
+ be faster and more secure so that I have a better experience.'' The team is
+ struggling to estimate this story and cannot determine what ''done'' looks like.'
+ anchor_prompt: using INVEST
+ paraphrase_prompt: What should you do to make this story ready for sprint planning?
+ options:
+ A: Add more detailed technical specifications about performance benchmarks and
+ security protocols, then assign it to the most senior developer who can handle
+ the complexity
+ B: Split this into multiple smaller stories with specific acceptance criteria,
+ such as 'reduce page load time to under 2 seconds' and 'implement two-factor
+ authentication for login'
+ C: Move the story to the next release cycle and create a technical spike to
+ research all possible performance and security improvements before writing
+ any user stories
+ D: Keep the story as-is but add story points based on the team's gut feeling,
+ since users clearly value performance and security improvements
+ correct: B
diff --git a/evaluations/specs/iso-25010.yaml b/evaluations/specs/iso-25010.yaml
new file mode 100644
index 0000000..2cd2dcb
--- /dev/null
+++ b/evaluations/specs/iso-25010.yaml
@@ -0,0 +1,51 @@
+anchor: iso-25010
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "ISO/IEC 25010"?
+
+ '
+ options:
+ A: A comprehensive framework for software architecture evaluation that provides
+ systematic methods to assess quality attributes through scenario-based analysis,
+ stakeholder workshops, and risk identification processes to determine architectural
+ trade-offs and decisions.
+ B: 'Eight top-level quality characteristics that describe the quality of a software
+ product: Functional Suitability, Performance Efficiency, Compatibility, Usability,
+ Reliability, Security, Maintainability, and Portability; Functional Suitability,
+ for example, is the degree to which the product provides functions that meet stated
+ and implied needs — sub-characteristics: functional completeness, functional correctness, functional appropriateness'
+ C: A standardized template structure consisting of twelve sections for documenting
+ software architecture decisions including context, functional requirements,
+ building blocks, runtime views, deployment views, and architectural decisions
+ with their rationales and consequences.
+ D: A lightweight methodology for capturing and communicating architectural decisions
+ through structured records that document the title, status, context, decision
+ rationale, and consequences of significant architectural choices made during
+ software development projects.
+ correct: B
+ application:
+ scenario: Your development team is building a new mobile banking application and
+ needs to establish quality requirements for the project. The product owner has
+ expressed concerns about user satisfaction, system downtime, and data protection,
+ but the requirements are currently vague and unmeasurable. The team needs a
+ structured approach to define specific, testable quality criteria that align
+ with industry standards.
+ anchor_prompt: using ISO/IEC 25010
+ paraphrase_prompt: to establish comprehensive, measurable quality requirements
+ that cover all critical aspects of software quality
+ options:
+ A: Focus primarily on functional requirements and add basic performance benchmarks,
+ security protocols, and user acceptance criteria as secondary considerations
+ to be refined during testing phases.
+ B: 'Define specific measurable requirements across the eight quality characteristics:
+ functional suitability, performance efficiency, compatibility, usability,
+ reliability, security, maintainability, and portability, with concrete sub-characteristics
+ for each.'
+ C: Create a custom quality framework based on stakeholder interviews, competitor
+ analysis, and industry best practices, then map these findings to relevant
+ testing strategies and acceptance criteria.
+ D: Establish quality gates focused on code coverage, automated testing results,
+ security scan outcomes, and user story completion rates to ensure comprehensive
+ quality measurement throughout development.
+ correct: B
diff --git a/evaluations/specs/jobs-to-be-done.yaml b/evaluations/specs/jobs-to-be-done.yaml
new file mode 100644
index 0000000..f659523
--- /dev/null
+++ b/evaluations/specs/jobs-to-be-done.yaml
@@ -0,0 +1,37 @@
+anchor: jobs-to-be-done
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Jobs To Be Done (JTBD)"?
+
+ '
+ options:
+ A: A systematic approach to breaking down complex projects into smaller, manageable
+ tasks with clear deliverables
+ B: Progress people want to make in a particular context; practical task to accomplish
+ C: A framework for defining user personas and their specific roles within an
+ organization or system
+ D: A methodology for mapping employee responsibilities to business objectives
+ and performance metrics
+ correct: B
+ application:
+ scenario: A fitness app company is struggling with low user retention despite
+ having comprehensive features like workout tracking, nutrition logging, and
+ social sharing. Users download the app but stop using it within two weeks. The
+ product team needs to understand why users aren't sticking with their solution.
+ anchor_prompt: using Jobs To Be Done (JTBD)
+ paraphrase_prompt: What approach should the product team take to understand the
+ underlying reasons users seek fitness solutions and why they abandon the current
+ app?
+ options:
+ A: Conduct user surveys asking about preferred features, UI design feedback,
+ and demographic information to identify which user segments need different
+ functionality
+ B: Interview users about the specific circumstances that led them to seek a
+ fitness solution, what progress they were trying to make, and what they hired
+ instead when they stopped using the app
+ C: Analyze competitor apps to identify missing features and benchmark against
+ industry best practices for user engagement and retention metrics
+ D: Create detailed user personas based on age, fitness level, and lifestyle
+ to design targeted features for each segment and improve onboarding flows
+ correct: B
diff --git a/evaluations/specs/lasr.yaml b/evaluations/specs/lasr.yaml
new file mode 100644
index 0000000..f91be6f
--- /dev/null
+++ b/evaluations/specs/lasr.yaml
@@ -0,0 +1,44 @@
+anchor: lasr
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "LASR according to Toth/Zörner"?
+
+ '
+ options:
+ A: Lightweight Architecture Decision Records that capture the context, decision,
+ and consequences of significant architectural choices using a standardized
+ template format for documentation and communication purposes
+ B: High-level description of how the solution addresses the most important quality
+ requirements and constraints; the central architectural ideas that shape the
+ system; key structural and runtime views showing the main building blocks,
+ their responsibilities, and how they interact at runtime
+ C: Low-level technical specification that defines implementation details, coding
+ standards, and deployment procedures; focuses on concrete technology choices
+ and step-by-step guidance for development teams
+ D: Hierarchical visual modeling technique using context, container, component,
+ and code diagrams to represent software architecture at different levels of
+ abstraction for stakeholder communication and system understanding
+ correct: B
+ application:
+ scenario: Your team has just completed the initial architecture design for a new
+ e-commerce platform that must handle high traffic loads and integrate with multiple
+ payment providers. The product owner and development teams need to understand
+ the key architectural decisions before implementation begins.
+ anchor_prompt: using LASR according to Toth/Zörner
+ paraphrase_prompt: What should you focus on when creating a lightweight architecture
+ document that effectively communicates the essential architectural information
+ to stakeholders?
+ options:
+ A: Document the complete system context, detailed component specifications,
+ comprehensive deployment views, and full traceability matrices to ensure nothing
+ is missed.
+ B: Focus on the core solution strategy for handling traffic loads, key structural
+ components and their interactions, critical interfaces with payment providers,
+ and identified scalability risks with mitigation plans.
+ C: Create detailed use case diagrams, complete database schemas, exhaustive
+ API documentation, and comprehensive testing strategies to cover all system
+ aspects.
+ D: Emphasize stakeholder concerns, detailed quality scenarios, complete constraint
+ documentation, and thorough architectural decision records with full rationale.
+ correct: B
diff --git a/evaluations/specs/linddun.yaml b/evaluations/specs/linddun.yaml
new file mode 100644
index 0000000..c5e95a6
--- /dev/null
+++ b/evaluations/specs/linddun.yaml
@@ -0,0 +1,39 @@
+anchor: linddun
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "LINDDUN"?
+
+ '
+ options:
+ A: A security threat modeling framework that categorizes threats into Spoofing,
+ Tampering, Repudiation, Information Disclosure, Denial of Service, and Elevation
+ of Privilege
+ B: A privacy threat modeling framework that systematically identifies threats
+ like Linkability, Identifiability, Non-repudiation, Detectability, Disclosure
+ of information, Unawareness, and Non-compliance
+ C: A data protection compliance checklist for ensuring GDPR conformity through
+ technical and organizational measures
+ D: A risk assessment methodology that evaluates security controls against a
+ catalog of known vulnerabilities and attack patterns
+ correct: B
+ application:
+ scenario: A healthcare startup is developing a telemedicine platform that collects
+ patient medical records, processes video consultations, and shares data with
+ insurance providers. The development team needs to systematically identify potential
+ privacy risks before the platform launches to ensure GDPR compliance.
+ anchor_prompt: using LINDDUN
+ paraphrase_prompt: What systematic approach should the team take to comprehensively
+ identify and categorize privacy threats across all aspects of their data handling
+ practices?
+ options:
+ A: Focus primarily on encryption and access controls, then conduct penetration
+ testing to identify vulnerabilities in data transmission and storage mechanisms.
+ B: 'Systematically analyze the system against seven privacy threat categories:
+ Linkability, Identifiability, Non-repudiation, Detectability, Disclosure,
+ Unawareness, and Non-compliance to identify specific privacy risks.'
+ C: Implement a risk assessment matrix focusing on data classification, user
+ authentication, and regulatory compliance requirements across different jurisdictions.
+ D: Conduct stakeholder interviews to understand privacy concerns, then map data
+ flows and apply general security threat modeling techniques to identify risks.
+ correct: B
diff --git a/evaluations/specs/llm-evaluations.yaml b/evaluations/specs/llm-evaluations.yaml
new file mode 100644
index 0000000..40c4f47
--- /dev/null
+++ b/evaluations/specs/llm-evaluations.yaml
@@ -0,0 +1,44 @@
+anchor: llm-evaluations
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "LLM-Evaluations"?
+
+ '
+ options:
+ A: Automated testing frameworks that generate adversarial prompts and edge cases
+ to identify failure modes in language models through mutation-based prompt
+ engineering and stress testing methodologies.
+ B: Standardized datasets and tasks used to compare LLM capabilities — MMLU (Massive
+ Multitask Language Understanding), HellaSwag, HumanEval, BIG-Bench, GSM8K,
+ TruthfulQA, ARC; quantitative measures of model quality — perplexity, accuracy,
+ BLEU, ROUGE, F1, pass@k (code generation), exact match, calibration
+ C: Architectural design patterns and best practices for deploying large language
+ models in production environments, including load balancing, caching strategies,
+ model versioning, and API gateway configurations.
+ D: Chain-of-thought reasoning techniques that enable language models to break
+ down complex problems into step-by-step logical processes, improving performance
+ on mathematical and analytical tasks through structured prompting.
+ correct: B
+ application:
+ scenario: Your team has developed a new code generation LLM and needs to compare
+ its performance against existing models like GPT-4 and Claude before deciding
+ whether to deploy it in production. The model will be used for generating Python
+ functions from natural language descriptions in your company's development workflow.
+ anchor_prompt: using LLM-Evaluations
+ paraphrase_prompt: What systematic approach should you take to rigorously assess
+ and compare your model's capabilities against established alternatives?
+ options:
+ A: Run your model on a few hand-selected coding problems, measure basic accuracy,
+ and compare the results against published performance numbers from other models'
+ documentation.
+ B: Evaluate on standardized benchmarks like HumanEval using pass@k metrics,
+ conduct human preference comparisons for code quality, and test for potential
+ training data contamination across multiple coding tasks.
+ C: Deploy all models to a staging environment, collect user feedback over several
+ weeks, and choose the model that receives the highest average satisfaction
+ ratings from developers.
+ D: Measure inference speed and memory usage across different model sizes, then
+ select the model that provides the best performance-to-cost ratio for your
+ infrastructure requirements.
+ correct: B
diff --git a/evaluations/specs/madr.yaml b/evaluations/specs/madr.yaml
new file mode 100644
index 0000000..3f574a0
--- /dev/null
+++ b/evaluations/specs/madr.yaml
@@ -0,0 +1,37 @@
+anchor: madr
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "MADR"?
+
+ '
+ options:
+ A: Agile methodology for iterative decision-making processes; collaborative
+ framework
+ B: Markdown template for recording architectural decisions; well-defined format with specific sections and standard fields
+ C: Software architecture pattern for microservices decomposition; modular design
+ approach
+ D: Risk assessment framework for technical debt management; evaluation methodology
+ correct: B
+ application:
+ scenario: Your team is evaluating three different caching strategies (Redis, Memcached,
+ or in-memory caching) for a high-traffic e-commerce platform. The decision will
+ significantly impact performance, scalability, and operational complexity. You
+ need to document this architectural decision for future reference and stakeholder
+ review.
+ anchor_prompt: using MADR
+ paraphrase_prompt: document this architectural decision with a structured approach
+ that explicitly captures all evaluated alternatives and their trade-offs
+ options:
+ A: Create a simple decision log entry with the chosen option (Redis) and a brief
+ rationale, then store it in the project wiki for easy access and updates.
+ B: Document the decision with sections for context, decision drivers, all three
+ caching options as considered alternatives, pros/cons analysis for each, and
+ the final outcome with justification.
+ C: Write a comprehensive technical specification document detailing the implementation
+ approach for Redis, including configuration parameters, monitoring setup,
+ and deployment procedures.
+ D: Record the decision in a structured format focusing primarily on the chosen
+ solution's benefits and implementation details, with minimal coverage of rejected
+ alternatives.
+ correct: B
diff --git a/evaluations/specs/mece.yaml b/evaluations/specs/mece.yaml
new file mode 100644
index 0000000..ece104e
--- /dev/null
+++ b/evaluations/specs/mece.yaml
@@ -0,0 +1,33 @@
+anchor: mece
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes the "MECE Principle"?
+
+ '
+ options:
+ A: Prioritizing requirements into Must/Should/Could/Won't categories
+ B: Structuring categories so they do not overlap and collectively cover all
+ possibilities
+ C: Presenting the conclusion first, then organizing supporting arguments hierarchically
+ D: Decomposing work into independent, negotiable, and testable user stories
+ correct: B
+ application:
+ scenario: Your team is designing a new e-commerce platform and needs to organize
+ the main functional areas into microservices. The platform must handle user
+ management, product catalog, shopping cart, order processing, payment handling,
+ inventory tracking, and customer support features.
+ anchor_prompt: using MECE Principle
+ paraphrase_prompt: How should you organize these functional areas to ensure complete
+ coverage with no overlapping responsibilities between services?
+ options:
+ A: 'Group by user-facing vs backend services: (User Management, Product Catalog,
+ Shopping Cart) and (Order Processing, Payment, Inventory, Support)'
+ B: 'Organize by business capability: User Service, Catalog Service, Cart Service,
+ Order Service, Payment Service, Inventory Service, Support Service'
+ C: 'Structure by data access patterns: Read-heavy services (Catalog, Support)
+ and Write-heavy services (User, Cart, Order, Payment, Inventory)'
+ D: 'Arrange by development team expertise: Core services (User, Product, Cart),
+ Transaction services (Order, Payment), and Operations services (Inventory,
+ Support)'
+ correct: B
diff --git a/evaluations/specs/morphological-box.yaml b/evaluations/specs/morphological-box.yaml
new file mode 100644
index 0000000..d6b3728
--- /dev/null
+++ b/evaluations/specs/morphological-box.yaml
@@ -0,0 +1,39 @@
+anchor: morphological-box
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Morphological Box"?
+
+ '
+ options:
+ A: Systematically evaluate and score alternative solutions against weighted
+ criteria using a decision matrix approach
+ B: Break complex problem into independent parameters/dimensions; identify possible
+ values/options for each parameter
+ C: Create mutually exclusive and collectively exhaustive categories to ensure
+ complete problem space coverage without overlap
+ D: Generate creative solutions by combining random elements from different domains
+ through structured brainstorming techniques
+ correct: B
+ application:
+ scenario: 'Your team is designing a new API gateway solution and needs to explore
+ all possible architectural combinations. There are multiple independent dimensions
+ to consider: authentication methods (OAuth2, JWT, API keys, mTLS), rate limiting
+ strategies (token bucket, sliding window, fixed window), storage backends (Redis,
+ PostgreSQL, DynamoDB), and deployment models (containerized, serverless, VM-based).'
+ anchor_prompt: using Morphological Box
+ paraphrase_prompt: What systematic approach should you take to ensure you've considered
+ all viable architectural combinations before making design decisions?
+ options:
+ A: Focus on the most critical dimension first, select the best option for that
+ dimension, then optimize the remaining dimensions around that choice to reduce
+ complexity
+ B: Create a matrix with each dimension as a column, list all possible options
+ for each dimension as rows, then systematically evaluate combinations while
+ filtering out infeasible ones
+ C: Conduct stakeholder interviews to determine preferences for each dimension,
+ then use weighted scoring to rank the top three combinations based on business
+ priorities
+ D: Research industry best practices for each dimension independently, then combine
+ the most popular choices from each category to create a proven solution
+ correct: B
diff --git a/evaluations/specs/moscow.yaml b/evaluations/specs/moscow.yaml
new file mode 100644
index 0000000..9940477
--- /dev/null
+++ b/evaluations/specs/moscow.yaml
@@ -0,0 +1,44 @@
+anchor: moscow
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "MoSCoW"?
+
+ '
+ options:
+ A: A visual mapping technique that organizes user stories chronologically to
+ identify gaps and prioritize features based on user journey stages and business
+ value delivery.
+ B: Non-negotiable requirements essential for the current delivery; without them
+ the solution is unusable or unsafe; important requirements that are not vital;
+ painful to leave out but the solution is still viable without them
+ C: A collaborative workshop method for creating shared understanding of project
+ scope by mapping stakeholders, impacts, and deliverables against strategic
+ business objectives.
+ D: A risk assessment framework that categorizes project uncertainties into severity
+ levels to determine mitigation strategies and contingency planning approaches
+ for delivery teams.
+ correct: B
+ application:
+ scenario: Your agile team has 15 user stories estimated at 120 story points for
+ the next 3-week sprint, but your velocity is only 80 points. The product owner
+ needs to decide which stories to include while ensuring stakeholders understand
+ what won't be delivered.
+ anchor_prompt: using MoSCoW
+ paraphrase_prompt: How should you categorize and communicate the stories to stakeholders
+ for this sprint?
+ options:
+ A: Rank all 15 stories from 1-15 by business value, select the top-ranked stories
+ that fit within 80 points, and inform stakeholders that lower-ranked items
+ are deferred to future sprints.
+ B: Categorize stories into Must have (critical for sprint goal), Should have
+ (important but not essential), Could have (nice to have), and Won't have this
+ sprint (explicitly out of scope), then select from each category to fit 80
+ points.
+ C: Group stories by feature area, estimate the effort for each group, select
+ complete feature groups that fit within 80 points, and communicate to stakeholders
+ which feature areas are postponed.
+ D: Sort stories by technical complexity and business impact using a 2x2 matrix,
+ prioritize high-impact low-complexity items first, and explain to stakeholders
+ which quadrants won't be addressed this sprint.
+ correct: B
diff --git a/evaluations/specs/mutation-testing.yaml b/evaluations/specs/mutation-testing.yaml
new file mode 100644
index 0000000..d90898b
--- /dev/null
+++ b/evaluations/specs/mutation-testing.yaml
@@ -0,0 +1,35 @@
+anchor: mutation-testing
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Mutation Testing"?
+
+ '
+ options:
+ A: Analyze genetic algorithms and evolutionary programming techniques to optimize
+ software performance through iterative code modifications
+ B: Evaluate how effective tests are at detecting bugs; deliberately introduce
+ small, syntactic changes (mutants) into source code
+ C: Test software behavior under various input combinations by systematically
+ changing configuration parameters and environmental conditions
+ D: Verify code stability by introducing controlled runtime errors and monitoring
+ how the system handles unexpected failures
+ correct: B
+ application:
+ scenario: Your team has achieved 95% code coverage on a critical payment processing
+ module, but recent production bugs have slipped through despite the high coverage
+ numbers. Management is questioning whether the existing test suite is actually
+ effective at catching defects.
+ anchor_prompt: using Mutation Testing
+ paraphrase_prompt: to evaluate whether your tests can actually detect bugs rather
+ than just execute code
+ options:
+ A: Add more unit tests to increase code coverage to 100% and focus on testing
+ edge cases that weren't previously covered
+ B: Introduce small deliberate code changes like flipping operators or removing
+ statements, then run your test suite to see which changes go undetected
+ C: Perform static code analysis to identify potential vulnerabilities and code
+ smells that could lead to production defects
+ D: Implement property-based testing to generate random inputs and verify that
+ system invariants hold across all scenarios
+ correct: B
diff --git a/evaluations/specs/negative-control.yaml b/evaluations/specs/negative-control.yaml
new file mode 100644
index 0000000..b7664a2
--- /dev/null
+++ b/evaluations/specs/negative-control.yaml
@@ -0,0 +1,17 @@
+anchor: negative-control
+tier: null
+
+# Negative control: a fabricated anchor that does not exist.
+# Tests whether the model distinguishes real anchors from invented ones.
+# Correct answer is D ("not an established methodology").
+
+questions:
+ recognition:
+ question: |
+ "Müller's Inversion Principle" is:
+ options:
+ A: "A testing methodology that reverses test execution order to detect hidden state dependencies"
+ B: "An architecture pattern that inverts control flow between framework and application code"
+ C: "A requirements technique that starts from the desired outcome and works backward to inputs"
+ D: "Not an established methodology — this term does not refer to a recognized concept"
+ correct: D
diff --git a/evaluations/specs/nelson-rules.yaml b/evaluations/specs/nelson-rules.yaml
new file mode 100644
index 0000000..ccd9bea
--- /dev/null
+++ b/evaluations/specs/nelson-rules.yaml
@@ -0,0 +1,36 @@
+anchor: nelson-rules
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Nelson Rules"?
+
+ '
+ options:
+ A: Two consecutive points beyond 2σ (Warning); 7 alternating points above and
+ below the mean (oscillation)
+ B: One point beyond 3σ (Outlier); 9 consecutive points on the same side of the
+ mean (shift/bias)
+ C: Four points in a row increasing or decreasing (Trend); 6 consecutive points
+ within 1σ (Clustering)
+ D: Eight points beyond 1σ on alternating sides (Variance); 5 consecutive points
+ forming a pattern (Sequence)
+ correct: B
+ application:
+ scenario: Your team is monitoring API response times using a control chart with
+ mean=200ms and standard deviation=50ms. Over the past 15 data points, you notice
+ that 14 consecutive measurements have been alternating between values above
+ and below the mean (e.g., 220ms, 180ms, 230ms, 170ms, etc.), creating a zigzag
+ pattern.
+ anchor_prompt: using Nelson Rules
+ paraphrase_prompt: to systematically detect whether this alternating pattern indicates
+ a non-random process issue that requires investigation
+ options:
+ A: Focus only on whether any individual points exceed 3 standard deviations
+ from the mean, as this is the most reliable indicator of process problems
+ B: Flag this as a special cause violation since 14 alternating points up and
+ down indicates systematic oscillation that suggests an assignable cause
+ C: Wait for at least 9 consecutive points on the same side of the mean before
+ concluding there is a process shift requiring attention
+ D: Check if 4 out of 5 consecutive points fall beyond 1 standard deviation on
+ the same side before taking corrective action
+ correct: B
diff --git a/evaluations/specs/owasp-top-10.yaml b/evaluations/specs/owasp-top-10.yaml
new file mode 100644
index 0000000..9d8c85a
--- /dev/null
+++ b/evaluations/specs/owasp-top-10.yaml
@@ -0,0 +1,42 @@
+anchor: owasp-top-10
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "OWASP Top 10"?
+
+ '
+ options:
+ A: A regularly updated list of the ten most critical web application security
+ risks, covering threats like injection, broken access control, and cryptographic
+ failures
+ B: A comprehensive framework for assessing software reliability and safety integrity
+ levels in critical systems with four distinct SIL classifications
+ C: An industry standard methodology for evaluating cybersecurity maturity across
+ organizational processes with five progressive capability levels
+ D: A systematic approach to privacy threat modeling that identifies linkability,
+ identifiability, and disclosure risks in software systems
+ correct: A
+ application:
+ scenario: Your team is conducting a security review of a web application that
+ handles customer financial data. During the assessment, you discover that user
+ input from web forms is directly concatenated into SQL queries without validation,
+ the application uses default database credentials, and sensitive customer data
+ is stored in plain text. The development team needs to prioritize which security
+ issues to address first.
+ anchor_prompt: using OWASP Top 10
+ paraphrase_prompt: prioritize these security vulnerabilities based on established
+ web application security risk categories
+ options:
+ A: Focus on the plain text storage issue first since data encryption is the
+ foundation of all security, then address input validation, and finally update
+ default credentials during the next major release cycle.
+ B: Address the SQL injection vulnerability first (A03 - Injection), then fix
+ the plain text storage (A02 - Cryptographic Failures), and finally remediate
+ the default credentials (A05 - Security Misconfiguration).
+ C: Implement comprehensive logging and monitoring capabilities first to detect
+ future attacks, then gradually address the technical vulnerabilities based
+ on development team availability and sprint capacity.
+ D: Prioritize fixing the default credentials first since they provide the easiest
+ attack vector, then address data encryption, and finally implement input validation
+ as a long-term security enhancement.
+ correct: B
diff --git a/evaluations/specs/plain-english-strunk-white.yaml b/evaluations/specs/plain-english-strunk-white.yaml
new file mode 100644
index 0000000..812b902
--- /dev/null
+++ b/evaluations/specs/plain-english-strunk-white.yaml
@@ -0,0 +1,50 @@
+anchor: plain-english-strunk-white
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Plain English according to Strunk
+ & White"?
+
+ '
+ options:
+ A: Write with a clear hierarchical structure where conclusions come first followed
+ by supporting arguments, using logical groupings that allow readers to understand
+ the main point before diving into details — start with the answer, then provide
+ the reasoning that led to that conclusion.
+ B: Every word in a sentence should serve a purpose; cut words that add bulk
+ without adding meaning — "the fact that" → "that", "owing to the fact that"
+ → "since"; prefer active constructions over passive; active voice is more
+ direct, vigorous, and concise — "the dog bit the man" not "the man was bitten
+ by the dog"
+ C: Use simple, everyday vocabulary and short sentences that can be understood
+ by the general public, avoiding technical jargon, complex grammatical structures,
+ and industry-specific terminology — choose 'help' over 'facilitate', 'use'
+ over 'utilize', and 'show' over 'demonstrate'.
+ D: Organize information in a bottom-line-up-front approach where the most important
+ message appears at the beginning, followed by supporting details arranged
+ in decreasing order of importance — present key findings first, then provide
+ the analysis and background that supports those conclusions.
+ correct: B
+ application:
+ scenario: A software engineer is reviewing API documentation that describes error
+ handling procedures. The current draft contains several sentences that feel
+ wordy and unclear, making it difficult for developers to quickly understand
+ what actions to take when errors occur.
+ anchor_prompt: using Plain English according to Strunk & White
+ paraphrase_prompt: to make the error handling documentation as clear, direct,
+ and concise as possible for developer users
+ options:
+ A: Add more detailed explanations and qualifying phrases like 'generally speaking'
+ and 'in most cases' to ensure developers understand the nuanced conditions
+ under which different error handling approaches might be considered appropriate.
+ B: Remove unnecessary words, use active voice, and place the most important
+ action at the end of each sentence. Replace 'In the event that an error occurs'
+ with 'When an error occurs' and 'The system will be restarted by the administrator'
+ with 'The administrator restarts the system.'
+ C: Include comprehensive background context about why each error might occur,
+ using rich descriptive language and multiple adjectives to paint a complete
+ picture of potential system states and failure modes.
+ D: Restructure sentences to use passive voice consistently, add transitional
+ phrases between concepts, and include apologetic language like 'unfortunately'
+ and 'regrettably' to acknowledge the inconvenience of errors.
+ correct: B
diff --git a/evaluations/specs/prd.yaml b/evaluations/specs/prd.yaml
new file mode 100644
index 0000000..d25c7ce
--- /dev/null
+++ b/evaluations/specs/prd.yaml
@@ -0,0 +1,41 @@
+anchor: prd
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "PRD"?
+
+ '
+ options:
+ A: Systematic prioritization framework using Must have, Should have, Could have,
+ and Won't have categories to rank feature requirements based on business value
+ and urgency
+ B: Clear articulation of the problem to be solved and the target users; measurable
+ outcomes that define what "done" looks like (KPIs, OKRs)
+ C: Visual representation of user activities and tasks arranged chronologically
+ to identify gaps, priorities, and release planning opportunities for product
+ development
+ D: Structured template format capturing user needs as role-based scenarios with
+ acceptance criteria to ensure requirements are testable and implementable
+ by development teams
+ correct: B
+ application:
+ scenario: Your startup is building a new mobile app for freelance project management.
+ The engineering team keeps asking clarifying questions about features, the design
+ team is unsure about user workflows, and marketing needs to understand the target
+ audience. Stakeholders have conflicting ideas about what should be included
+ in the first release.
+ anchor_prompt: using PRD
+ paraphrase_prompt: What document should you create to align all teams and provide
+ a comprehensive foundation for product development?
+ options:
+ A: Create a technical architecture document that outlines the system components,
+ database schema, and API specifications to guide the engineering team's implementation
+ decisions.
+ B: Write a comprehensive document that defines the problem statement, target
+ users, success metrics, functional requirements, scope boundaries, and constraints
+ to align all stakeholders.
+ C: Develop a project timeline with detailed user stories, acceptance criteria,
+ and sprint planning to coordinate development activities across all teams.
+ D: Conduct stakeholder interviews and create a competitive analysis report with
+ market research findings to inform strategic product positioning decisions.
+ correct: B
diff --git a/evaluations/specs/problem-space-nvc.yaml b/evaluations/specs/problem-space-nvc.yaml
new file mode 100644
index 0000000..fb99de0
--- /dev/null
+++ b/evaluations/specs/problem-space-nvc.yaml
@@ -0,0 +1,47 @@
+anchor: problem-space-nvc
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Nonviolent Communication (Rosenberg)"?
+
+ '
+ options:
+ A: A structured approach to code reviews that emphasizes constructive feedback
+ through specific examples and actionable suggestions rather than general criticism.
+ B: Concrete, objective facts without evaluation or judgment. "The deploy failed
+ three times this week" instead of "The deploy always fails."; emotions arising
+ from observations. "I feel frustrated" instead of "This is frustrating."
+ C: A conflict resolution methodology that focuses on identifying root causes
+ of team disagreements and establishing clear communication protocols between
+ stakeholders.
+ D: An agile communication framework that prioritizes transparent status updates
+ and eliminates ambiguous language in sprint retrospectives and daily standups.
+ correct: B
+ application:
+ scenario: 'A product manager sends an email to the development team: ''The search
+ feature is completely broken and users are complaining constantly. You developers
+ never test anything properly before releasing. This is unacceptable and needs
+ to be fixed immediately.'' The team lead wants to respond in a way that addresses
+ the concerns while maintaining a collaborative relationship.'
+ anchor_prompt: using Nonviolent Communication (Rosenberg)
+ paraphrase_prompt: Write the team lead's reply using a structured approach that
+ separates facts from judgments, acknowledges emotions, identifies underlying
+ needs, and makes specific actionable requests.
+ options:
+ A: I understand you're frustrated with the search feature. However, saying we
+ 'never test properly' isn't accurate - we do have testing procedures. Let's
+ schedule a meeting to discuss how we can improve our QA process and address
+ the user complaints more systematically.
+ B: I notice the search feature has generated 15 user complaints this week. I
+ feel concerned because I value delivering quality software that meets user
+ needs. Could we schedule a 30-minute meeting tomorrow to review the specific
+ issues and create an action plan together?
+ C: Thanks for bringing this to our attention. You're right that the search feature
+ has issues and we need to address them quickly. I'll have the team prioritize
+ this as our top bug fix and we'll provide daily updates on our progress until
+ it's resolved.
+ D: I appreciate your passion for quality, and I share your concern about user
+ experience. While the search feature does have problems, let's focus on solutions
+ rather than blame. What specific search scenarios are failing, and what would
+ success look like from your perspective?
+ correct: B
diff --git a/evaluations/specs/property-based-testing.yaml b/evaluations/specs/property-based-testing.yaml
new file mode 100644
index 0000000..4330600
--- /dev/null
+++ b/evaluations/specs/property-based-testing.yaml
@@ -0,0 +1,36 @@
+anchor: property-based-testing
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Property-Based Testing"?
+
+ '
+ options:
+ A: Testing software by validating that object properties and attributes maintain
+ expected values throughout execution
+ B: Invariants that should always hold; automatic test data creation
+ C: A testing methodology that focuses on verifying ownership and access rights
+ of system resources and data
+ D: Unit testing approach that examines individual class properties and their
+ getter/setter method implementations
+ correct: B
+ application:
+ scenario: You're developing a financial calculator library with functions for
+ compound interest, loan payments, and currency conversions. The library will
+ be used by multiple client applications, and accuracy is critical since even
+ small rounding errors could accumulate into significant financial discrepancies
+ over time.
+ anchor_prompt: using Property-Based Testing
+ paraphrase_prompt: What testing approach would best validate that your financial
+ calculations maintain mathematical correctness across all possible input ranges?
+ options:
+ A: Write comprehensive unit tests covering typical financial scenarios like
+ 30-year mortgages, common interest rates, and standard loan amounts
+ B: Define mathematical invariants like 'a higher interest rate never yields a lower balance'
+ and generate thousands of random valid inputs to verify these properties always
+ hold
+ C: Create integration tests that simulate real user workflows by testing complete
+ financial scenarios from input to final calculation output
+ D: Implement regression tests using historical financial data from previous
+ system versions to ensure calculations remain consistent over time
+ correct: B
diff --git a/evaluations/specs/pyramid-principle.yaml b/evaluations/specs/pyramid-principle.yaml
new file mode 100644
index 0000000..3343a5d
--- /dev/null
+++ b/evaluations/specs/pyramid-principle.yaml
@@ -0,0 +1,42 @@
+anchor: pyramid-principle
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Pyramid Principle according
+ to Barbara Minto"?
+
+ '
+ options:
+ A: Hierarchical software architecture pattern where base components support
+ higher-level modules; follows dependency inversion with abstractions at the
+ top layer
+ B: Single key message at the top of the pyramid; situation → complication →
+ question → answer structure for setting context
+ C: Project management framework organizing tasks in ascending priority levels;
+ uses risk assessment → resource allocation → timeline planning → execution
+ phases
+ D: Information architecture methodology structuring content from broad categories
+ to specific details; applies user journey mapping with navigation flow optimization
+ correct: B
+ application:
+ scenario: Your engineering team has discovered a critical security vulnerability
+ in the production system that requires immediate attention and significant resources
+ to fix. The CTO has requested a 10-minute presentation to the executive team
+ explaining the situation and recommending next steps.
+ anchor_prompt: using Pyramid Principle according to Barbara Minto
+ paraphrase_prompt: How should you structure your presentation to maximize clarity
+ and executive buy-in for your recommended solution?
+ options:
+ A: Start with technical details of the vulnerability, explain how it was discovered,
+ walk through potential attack vectors, then conclude with your recommended
+ fix and resource requirements.
+ B: Lead with your recommendation to allocate resources for immediate patching,
+ then explain the current security risk situation, the complications it creates
+ for business operations, and supporting evidence for your proposed solution.
+ C: Present three possible solutions with pros and cons for each, provide detailed
+ technical analysis of the vulnerability, then ask the executives to vote on
+ which approach they prefer.
+ D: Begin by establishing credibility through your team's security expertise,
+ chronologically explain how the vulnerability was discovered, detail the investigation
+ process, then present findings and recommendations.
+ correct: B
diff --git a/evaluations/specs/sanity-check.yaml b/evaluations/specs/sanity-check.yaml
new file mode 100644
index 0000000..c0b7c31
--- /dev/null
+++ b/evaluations/specs/sanity-check.yaml
@@ -0,0 +1,17 @@
+anchor: sanity-check
+tier: null
+
+# Sanity check: none of the options is correct (the answer is 42).
+# Every model MUST score 0% because it will pick a wrong option.
+# If any model scores >0%, the scoring pipeline has a bug.
+
+questions:
+ recognition:
+ question: |
+ What is the Answer to the Ultimate Question of Life, the Universe, and Everything?
+ options:
+ A: "17"
+ B: "23"
+ C: "99"
+ D: "256"
+ correct: X
diff --git a/evaluations/specs/semantic-versioning.yaml b/evaluations/specs/semantic-versioning.yaml
new file mode 100644
index 0000000..0bc881e
--- /dev/null
+++ b/evaluations/specs/semantic-versioning.yaml
@@ -0,0 +1,32 @@
+anchor: semantic-versioning
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Semantic Versioning (SemVer)"?
+
+ '
+ options:
+ A: Version control system that automatically tracks semantic changes in code
+ structure and meaning across different branches
+ B: A versioning scheme using MAJOR.MINOR.PATCH where MAJOR signals breaking
+ changes, MINOR signals new features, and PATCH signals bug fixes
+ C: Development methodology that prioritizes meaningful variable and function
+ naming conventions to improve code readability
+ D: Documentation standard that requires detailed explanations of API functionality
+ and business logic for each software release
+ correct: B
+ application:
+ scenario: You maintain a JavaScript authentication library that currently has
+ version 2.3.1. You need to release an update that adds a new optional parameter
+ to an existing login method, includes several bug fixes for token validation,
+ and removes a deprecated method that was marked for removal six months ago.
+ anchor_prompt: using Semantic Versioning (SemVer)
+ paraphrase_prompt: determine the appropriate version number for this release that
+ properly communicates the impact of changes to library consumers
+ options:
+ A: 2.3.2 - since the new parameter is optional and doesn't break existing code
+ B: 3.0.0 - because removing the deprecated method constitutes a breaking change
+ C: 2.4.0 - to reflect the addition of new functionality with the optional parameter
+ D: 2.3.1-update.1 - using pre-release notation to indicate multiple types of
+ changes
+ correct: B
diff --git a/evaluations/specs/socratic-method.yaml b/evaluations/specs/socratic-method.yaml
new file mode 100644
index 0000000..3d21b8f
--- /dev/null
+++ b/evaluations/specs/socratic-method.yaml
@@ -0,0 +1,35 @@
+anchor: socratic-method
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Socratic Method"?
+
+ '
+ options:
+ A: Systematic approach to software development that emphasizes iterative refinement
+ through structured peer review and collaborative problem-solving sessions
+ B: Lead learners to insights through questions rather than direct instruction;
+ cross-examination technique to expose contradictions in beliefs
+ C: Teaching methodology that breaks complex problems into smaller components
+ and builds understanding through sequential presentation of foundational concepts
+ D: Architectural pattern that separates concerns by organizing code into distinct
+ layers with well-defined interfaces and dependency injection principles
+ correct: B
+ application:
+ scenario: During a code review, a senior developer notices that a junior developer
+ has implemented a caching solution that could cause data consistency issues
+ in a distributed system. The junior developer seems confident in their approach
+ and hasn't considered the potential problems.
+ anchor_prompt: using Socratic Method
+ paraphrase_prompt: to help the junior developer discover the potential issues
+ through guided inquiry rather than direct criticism
+ options:
+ A: Point out the specific data consistency problems and explain why the current
+ caching approach won't work in a distributed environment.
+ B: Ask questions like 'What happens when multiple services update the same cached
+ data?' and 'How does your cache handle network partitions?'
+ C: Suggest they research distributed caching patterns and come back with alternative
+ solutions before proceeding with the implementation.
+ D: Approve the code for now but schedule a follow-up meeting to discuss distributed
+ systems architecture and caching strategies.
+ correct: B
diff --git a/evaluations/specs/sota.yaml b/evaluations/specs/sota.yaml
new file mode 100644
index 0000000..0d22440
--- /dev/null
+++ b/evaluations/specs/sota.yaml
@@ -0,0 +1,36 @@
+anchor: sota
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "SOTA (State-of-the-Art)"?
+
+ '
+ options:
+ A: A standardized framework for documenting software architecture decisions
+ and technical specifications across development teams
+ B: Focus on the most current, cutting-edge methods and techniques; reference
+ current research papers, benchmarks, and empirical results
+ C: A methodology for systematic testing and validation of software systems against
+ predefined quality assurance benchmarks
+ D: An agile development approach that emphasizes iterative prototyping and continuous
+ integration of emerging technologies
+ correct: B
+ application:
+ scenario: Your team is building a new document search system for a legal firm
+ that needs to handle complex queries across millions of legal documents. The
+ current keyword-based search is inadequate, and you need to implement semantic
+ search capabilities that can understand legal terminology and context.
+ anchor_prompt: using SOTA (State-of-the-Art)
+ paraphrase_prompt: What approach would ensure you're implementing the most current
+ and highest-performing solution based on recent research and benchmarks?
+ options:
+ A: Implement a well-documented TF-IDF approach with legal domain customizations,
+ as it's proven reliable and easier to maintain than newer experimental methods.
+ B: Research recent papers on semantic search benchmarks, compare transformer-based
+ embedding models like BGE and E5, and implement the approach showing best
+ performance on legal document retrieval tasks.
+ C: Use the same semantic search architecture that worked well in your previous
+ project, making minor adjustments for the legal domain and document types.
+ D: Follow the semantic search tutorial from the framework documentation, as
+ it represents the vendor's recommended best practices for production systems.
+ correct: B
diff --git a/evaluations/specs/spc.yaml b/evaluations/specs/spc.yaml
new file mode 100644
index 0000000..13aefc9
--- /dev/null
+++ b/evaluations/specs/spc.yaml
@@ -0,0 +1,35 @@
+anchor: spc
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "SPC (Statistical Process Control)"?
+
+ '
+ options:
+ A: Structured programming methodology that controls code execution flow through
+ systematic elimination of goto statements and unstructured branching
+ B: Systematic statistical monitoring of running processes; inherent, random
+ fluctuation — stable and predictable
+ C: Statistical performance computing framework that optimizes system resources
+ by analyzing computational workload patterns and predicting bottlenecks
+ D: Software process certification standard that validates development methodologies
+ through rigorous documentation and compliance verification procedures
+ correct: B
+ application:
+ scenario: Your web application's API response times have been averaging 250ms
+ over the past month, but this week you've noticed some responses taking 400-500ms.
+ The development team wants to determine if this represents a real performance
+ degradation that needs investigation or just normal fluctuation.
+ anchor_prompt: using SPC (Statistical Process Control)
+ paraphrase_prompt: to systematically distinguish between normal process variation
+ and signals that indicate a real change requiring intervention
+ options:
+ A: Set a fixed threshold at 300ms and alert whenever any single response exceeds
+ this limit, then investigate each alert individually
+ B: Plot response times on a control chart with calculated control limits, then
+ apply detection rules to identify when the process shows special cause variation
+ C: Compare this week's average response time to last week's using a t-test and
+ investigate if the difference is statistically significant
+ D: Monitor the 95th percentile response time and trigger an investigation whenever
+ it increases by more than 10% from the baseline
+ correct: B
diff --git a/evaluations/specs/stride.yaml b/evaluations/specs/stride.yaml
new file mode 100644
index 0000000..69b9e74
--- /dev/null
+++ b/evaluations/specs/stride.yaml
@@ -0,0 +1,41 @@
+anchor: stride
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "STRIDE Threat Model"?
+
+ '
+ options:
+ A: A systematic approach for identifying and categorizing the top ten most critical
+ web application security risks; maintained by OWASP foundation; focuses on
+ injection flaws, broken authentication, and sensitive data exposure vulnerabilities
+ B: Impersonating another user, process, or system to gain unauthorized access;
+ mitigated by strong authentication; unauthorized modification of data in transit
+ or at rest; mitigated by integrity controls, digital signatures, and access
+ controls
+ C: A defensive security framework that assumes breach scenarios and implements
+ zero-trust principles; emphasizes continuous verification, least privilege
+ access, and micro-segmentation to limit lateral movement within networks
+ D: A risk assessment methodology for evaluating security controls in regulated
+ environments; provides quantitative scoring based on asset criticality, threat
+ likelihood, and business impact to prioritize remediation efforts
+ correct: B
+ application:
+ scenario: Your team is designing a new online banking application that handles
+ user authentication, financial transactions, and account data. During the security
+ design review, you need to systematically identify potential security threats
+ that could affect different components of the system.
+ anchor_prompt: using STRIDE Threat Model
+ paraphrase_prompt: systematically categorize the security threats by analyzing
+ each component against six fundamental threat categories
+ options:
+ A: Focus primarily on external attack vectors like SQL injection and cross-site
+ scripting, then assess the likelihood and business impact of each vulnerability
+ type
+ B: Examine each system component for Spoofing, Tampering, Repudiation, Information
+ Disclosure, Denial of Service, and Elevation of Privilege threats
+ C: Create attack trees starting from high-value assets, then trace backward
+ through all possible attack paths that could compromise those assets
+ D: Map all system entry points and data flows, then apply a risk rating matrix
+ based on confidentiality, integrity, and availability requirements
+ correct: B
diff --git a/evaluations/specs/swot.yaml b/evaluations/specs/swot.yaml
new file mode 100644
index 0000000..ef6edee
--- /dev/null
+++ b/evaluations/specs/swot.yaml
@@ -0,0 +1,45 @@
+anchor: swot
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "SWOT"?
+
+ '
+ options:
+ A: A prioritization framework that categorizes requirements into Should have,
+ Won't have this time, Optional features, and Time-critical deliverables to
+ manage project scope effectively
+ B: Internal positive attributes and resources that give the subject an advantage
+ over others; internal negative attributes or limitations that place the subject
+ at a disadvantage relative to others
+ C: A visual mapping technique that plots the evolution of components along a
+ value chain from genesis to commodity to identify strategic positioning and
+ dependencies
+ D: A decision-making matrix that systematically evaluates multiple alternatives
+ against weighted criteria by scoring each option to determine the optimal
+ solution objectively
+ correct: B
+ application:
+ scenario: Your team is evaluating whether to migrate from a monolithic architecture
+ to microservices for your e-commerce platform. The monolith has served you well
+ for 3 years but scaling challenges are emerging. You need to present a comprehensive
+ analysis to stakeholders covering all key factors that could influence this
+ architectural decision.
+ anchor_prompt: using SWOT
+ paraphrase_prompt: What framework should you use to systematically evaluate both
+ internal capabilities and external factors that could impact this architectural
+ migration decision?
+ options:
+ A: Create a decision matrix listing technical requirements as rows and architecture
+ options as columns, scoring each combination on feasibility and impact to
+ determine the optimal choice.
+ B: Analyze internal strengths and weaknesses of your current capabilities alongside
+ external opportunities and threats in the market to create a comprehensive
+ strategic assessment.
+ C: Map out the current value chain and evolution stages of each system component
+ to identify which parts are commodities versus differentiators before making
+ architectural changes.
+ D: Categorize all migration requirements into must-have, should-have, could-have,
+ and won't-have priorities to focus development effort on the most critical
+ architectural changes first.
+ correct: B
diff --git a/evaluations/specs/tdd-chicago-school.yaml b/evaluations/specs/tdd-chicago-school.yaml
new file mode 100644
index 0000000..0370501
--- /dev/null
+++ b/evaluations/specs/tdd-chicago-school.yaml
@@ -0,0 +1,39 @@
+anchor: tdd-chicago-school
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "TDD, Chicago School"?
+
+ '
+ options:
+ A: Write tests first, then implement code to pass those tests; focus on behavior
+ verification through extensive mocking of all dependencies
+ B: Verify the state of objects after operations; use real objects whenever possible;
+ mock only external dependencies
+ C: Design software architecture by defining interfaces first; use dependency
+ injection to isolate components and enable comprehensive unit testing
+ D: Develop code in small iterations with continuous refactoring; emphasize pair
+ programming and collective code ownership practices
+ correct: B
+ application:
+ scenario: You're developing an e-commerce order processing system with complex
+ business rules for discounts, taxes, and inventory management. The team needs
+ to implement the core OrderCalculator class that handles pricing logic, integrating
+ with external payment and inventory services.
+ anchor_prompt: using TDD, Chicago School
+ paraphrase_prompt: What approach should you take to develop and test this core
+ business logic component?
+ options:
+ A: Start by mocking all dependencies including the payment service, inventory
+ service, and database connections, then write tests that verify method calls
+ and interactions between components.
+ B: Begin with tests for the core pricing calculations using real domain objects,
+ mock only the external payment and inventory services, and let the design
+ emerge through refactoring cycles.
+ C: Create comprehensive interface definitions and mock implementations for all
+ collaborating objects first, then build the OrderCalculator by specifying
+ expected behavior through interaction testing.
+ D: Write integration tests that cover the entire order flow from UI to database,
+ then extract unit tests for individual components based on the integration
+ test scenarios.
+ correct: B
diff --git a/evaluations/specs/tdd-london-school.yaml b/evaluations/specs/tdd-london-school.yaml
new file mode 100644
index 0000000..56efbbb
--- /dev/null
+++ b/evaluations/specs/tdd-london-school.yaml
@@ -0,0 +1,40 @@
+anchor: tdd-london-school
+tier: 3
+
+questions:
+ recognition:
+ question: |
+ Which of the following best describes "TDD, London School"?
+ options:
+ A: State-based testing with real collaborating objects and minimal mocking
+ B: Outside-in development with mock-heavy, interaction-based testing
+ C: Acceptance testing using Given/When/Then scenario specifications
+ D: Exploratory testing focused on edge cases and unspecified behavior
+ correct: B
+
+ application:
+ scenario: |
+ You are reviewing a pull request. The code adds a new OrderService
+ that calls PaymentGateway and InventoryService.
+ What is your primary testing recommendation?
+ anchor_prompt: "using TDD, London School principles"
+ paraphrase_prompt: "Write isolated tests for the service layer"
+ options:
+ A: Write a test that processes a real order end-to-end through all three services
+ B: Write a test that mocks PaymentGateway and InventoryService to verify OrderService interactions
+ C: Write a test that checks the database state after processing an order
+ D: Skip unit tests and write an integration test with a test database
+ correct: B
+
+ consistency:
+ variants:
+ - 'Which proponent is most closely associated with "TDD, London School"?'
+ - 'Which proponent is most closely associated with "Mockist TDD"?'
+ - 'Which proponent is most closely associated with "Outside-In TDD"?'
+ language_variant: 'Welcher Proponent wird am engsten mit "TDD, London School" assoziiert?'
+ options:
+ A: Kent Beck
+ B: Steve Freeman
+ C: Dan North
+ D: Martin Fowler
+ correct: B
diff --git a/evaluations/specs/testing-pyramid.yaml b/evaluations/specs/testing-pyramid.yaml
new file mode 100644
index 0000000..77ff27c
--- /dev/null
+++ b/evaluations/specs/testing-pyramid.yaml
@@ -0,0 +1,35 @@
+anchor: testing-pyramid
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Testing Pyramid"?
+
+ '
+ options:
+ A: A hierarchical structure where system tests form the base, integration tests
+ the middle, and unit tests the top layer
+ B: Three layers; more unit tests, fewer e2e tests
+ C: A risk assessment framework that categorizes software defects into three
+ priority levels based on severity impact
+ D: A test organization model where manual testing supports automated testing
+ which supports exploratory testing at the apex
+ correct: B
+ application:
+ scenario: Your team is developing an e-commerce platform and currently has 20
+ unit tests, 50 integration tests, and 80 end-to-end tests. The CI/CD pipeline
+ takes 45 minutes to run, and developers are frustrated with slow feedback on
+ their commits. Management wants to improve development velocity while maintaining
+ quality.
+ anchor_prompt: using Testing Pyramid
+ paraphrase_prompt: What test distribution strategy would best optimize feedback
+ speed while maintaining comprehensive coverage?
+ options:
+ A: Increase all test types proportionally to 40 unit tests, 100 integration
+ tests, and 160 end-to-end tests for better coverage
+ B: Restructure to 200 unit tests, 40 integration tests, and 15 end-to-end tests,
+ moving logic validation to faster test layers
+ C: Focus primarily on integration tests with 30 unit tests, 120 integration
+ tests, and 20 end-to-end tests for balanced coverage
+ D: Maintain current ratios but optimize each test type for speed without changing
+ the overall distribution strategy
+ correct: B
diff --git a/evaluations/specs/timtowtdi.yaml b/evaluations/specs/timtowtdi.yaml
new file mode 100644
index 0000000..e545ac1
--- /dev/null
+++ b/evaluations/specs/timtowtdi.yaml
@@ -0,0 +1,36 @@
+anchor: timtowtdi
+tier: 1
+questions:
+ recognition:
+ question: 'Which of the following best describes "TIMTOWTDI"?
+
+ '
+ options:
+ A: A principle that problems can have multiple equally valid solutions, favoring
+ flexibility over prescription
+ B: A testing strategy that combines multiple test types to maximize coverage
+ C: A design pattern that delegates decisions to the most informed component
+ at runtime
+ D: A refactoring approach that transforms complex code into simpler equivalent
+ forms step by step
+ correct: A
+ application:
+ scenario: 'Your team is implementing user authentication for a web application.
+ Three developers have proposed different approaches: JWT tokens with Redis caching,
+ session-based authentication with database storage, and OAuth integration with
+ a third-party provider. All three solutions meet the technical requirements
+ and security standards.'
+ anchor_prompt: using TIMTOWTDI
+ paraphrase_prompt: How should the team handle this situation where multiple valid
+ technical approaches exist?
+ options:
+ A: Select the most popular industry standard approach to ensure long-term maintainability
+ and reduce technical risk.
+ B: Evaluate each approach's trade-offs in your specific context, discuss the
+ implications with the team, and choose based on your constraints rather than
+ dismissing valid alternatives.
+ C: Choose the approach proposed by the most senior developer to maintain team
+ hierarchy and avoid lengthy technical debates.
+ D: Implement the simplest solution first, then refactor to a more sophisticated
+ approach once you have more data about user requirements.
+ correct: B
diff --git a/evaluations/specs/todotxt-flavoured-markdown.yaml b/evaluations/specs/todotxt-flavoured-markdown.yaml
new file mode 100644
index 0000000..658d26e
--- /dev/null
+++ b/evaluations/specs/todotxt-flavoured-markdown.yaml
@@ -0,0 +1,42 @@
+anchor: todotxt-flavoured-markdown
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "todo.txt-flavoured Markdown"?
+
+ '
+ options:
+ A: A markup syntax that extends standard Markdown with project management features
+ using `@context` and `+project` tags, prioritized by numerical prefixes like
+ `1.`, `2.`, `3.`
+ B: Standard GitHub-flavoured markdown syntax (`- [ ]` uncompleted, `- [x]` completed);
+ uses todo.txt priority notation `(A)`, `(B)`, `(C)` where `(A)` is highest
+ priority
+ C: A documentation format that combines reStructuredText syntax with Kanban-style
+ workflow markers (`TODO:`, `DOING:`, `DONE:`) and uses hashtag priority levels
+ `#high`, `#medium`, `#low`
+ D: An issue tracking notation that merges JIRA-style ticket formatting with
+ plain text using bracketed status indicators `[OPEN]`, `[CLOSED]` and priority
+ weights expressed as `{P1}`, `{P2}`, `{P3}`
+ correct: B
+ application:
+ scenario: Your team is managing multiple feature development streams and bug fixes
+ across different projects. Team members need to track tasks that vary in priority,
+ belong to different projects, require specific tools or contexts, and have various
+ deadlines.
+ anchor_prompt: using todo.txt-flavoured Markdown
+ paraphrase_prompt: How should you structure your task list to combine readable
+ markdown formatting with systematic priority levels, project groupings, context
+ indicators, and searchable metadata?
+ options:
+ A: 'Use standard bullet points with custom formatting like `* HIGH: [Website]
+ Fix login bug - Computer work - Due: Feb 5th` and mark completed items by
+ moving them to a separate section'
+ B: Use checkbox syntax with priority letters, plus-prefixed project tags, at-prefixed
+ contexts, and key:value pairs like `- [ ] (A) Fix login bug +website @computer
+ due:2024-02-05`
+ C: Create separate markdown files for each priority level and use YAML frontmatter
+ to specify project, context, and due dates for each task list
+ D: 'Use numbered lists with emoji indicators for priority (🔥⚡⏰) and hashtag-style
+ tags like `1. 🔥 Fix login bug #website #computer #due-feb-5`'
+ correct: B
diff --git a/evaluations/specs/user-story-mapping.yaml b/evaluations/specs/user-story-mapping.yaml
new file mode 100644
index 0000000..08618d9
--- /dev/null
+++ b/evaluations/specs/user-story-mapping.yaml
@@ -0,0 +1,38 @@
+anchor: user-story-mapping
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "User Story Mapping"?
+
+ '
+ options:
+ A: Visual representation of user personas mapped to specific system requirements
+ and acceptance criteria
+ B: Horizontal arrangement of user activities; high-level tasks users perform
+ C: Hierarchical breakdown of software features organized by technical complexity
+ and development priority
+ D: Sequential workflow diagram showing user interactions and system responses
+ throughout the application lifecycle
+ correct: B
+ application:
+ scenario: Your team is building a new e-commerce mobile app and has collected
+ 47 user stories in the backlog. The product owner is struggling to explain the
+ release strategy to stakeholders, and developers are confused about how individual
+ stories connect to the overall user experience.
+ anchor_prompt: using User Story Mapping
+ paraphrase_prompt: What approach would best help the team visualize the complete
+ user journey and plan incremental releases?
+ options:
+ A: Group stories by technical complexity and implement the easiest ones first,
+ then present a demo to stakeholders showing completed features in order of
+ development difficulty.
+ B: Arrange stories horizontally by user activities in chronological order, then
+ stack them vertically by priority to identify thin slices of end-to-end functionality
+ for each release.
+ C: Create a detailed project timeline with all stories assigned to specific
+ sprints, then hold stakeholder meetings to review the Gantt chart and adjust
+ dates based on feedback.
+ D: Categorize stories by user role and estimate story points for each category,
+ then create a burndown chart to track progress and communicate velocity to
+ stakeholders.
+ correct: B
diff --git a/evaluations/specs/wardley-mapping.yaml b/evaluations/specs/wardley-mapping.yaml
new file mode 100644
index 0000000..818df78
--- /dev/null
+++ b/evaluations/specs/wardley-mapping.yaml
@@ -0,0 +1,38 @@
+anchor: wardley-mapping
+tier: 3
+questions:
+ recognition:
+ question: 'Which of the following best describes "Wardley Mapping"?
+
+ '
+ options:
+ A: Map system dependencies from infrastructure up; requirements → design → implementation
+ → deployment
+ B: Map components from user needs down; genesis → custom → product → commodity
+ C: Map stakeholder relationships outward; internal → partners → customers →
+ market segments
+ D: Map technical debt from legacy systems; identified → prioritized → refactored
+ → modernized
+ correct: B
+ application:
+ scenario: Your fintech startup is deciding whether to build a custom payment processing
+ system, integrate with an existing payment API like Stripe, or partner with
+ a traditional payment processor. The team is debating the strategic implications
+ of each approach for the company's long-term competitive position.
+ anchor_prompt: using Wardley Mapping
+ paraphrase_prompt: What strategic approach should guide this build-vs-buy-vs-partner
+ decision?
+ options:
+ A: Conduct a cost-benefit analysis comparing the total cost of ownership for
+ each option over a 3-year period, then select the lowest-cost solution that
+ meets current technical requirements.
+ B: Map the payment processing component's position on the evolution axis from
+ genesis to commodity, then choose build for genesis/custom stages and buy/partner
+ for product/commodity stages.
+ C: Survey competitors to see what payment solutions they use, then select the
+ same approach as the most successful competitor to ensure market alignment
+ and reduce strategic risk.
+ D: Evaluate each option based on development team capacity and timeline constraints,
+ prioritizing the approach that can be implemented fastest while maintaining
+ acceptable quality standards.
+ correct: B