diff --git a/evaluations/.gitignore b/evaluations/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/evaluations/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/evaluations/README.adoc b/evaluations/README.adoc new file mode 100644 index 0000000..8547c49 --- /dev/null +++ b/evaluations/README.adoc @@ -0,0 +1,158 @@ += Semantic Anchor Evaluations +:toc: + +== Overview + +Multiple-choice evaluation framework for testing whether semantic anchors work across different LLMs. +See the link:../docs/anchor-evaluations.adoc[full concept document] for background and methodology. + +== Quick Start + +=== Prerequisites + +* Python 3.10+ +* `pyyaml` package: `pip install pyyaml` +* At least one of: +** Claude Code CLI (authenticated) +** OpenAI API key (`OPENAI_API_KEY` environment variable) +** Ollama running locally + +=== Running the Pilot + +[source,bash] +---- +cd website + +# Claude Sonnet (default, via CLI) +python3 evaluations/pilot.py + +# Claude Haiku +python3 evaluations/pilot.py --model claude-haiku + +# GPT-4o-mini (requires OPENAI_API_KEY) +python3 evaluations/pilot.py --model openai + +# Ollama (requires local server + model) +ollama serve & # start server if not running +ollama pull qwen3:4b # pull model (once) +python3 evaluations/pilot.py --model ollama # uses qwen3:4b by default +python3 evaluations/pilot.py --model ollama --ollama-model mistral # other model + +# Multiple models at once +python3 evaluations/pilot.py --model claude-cli claude-haiku openai + +# Dry run (show prompts without sending) +python3 evaluations/pilot.py --dry-run +---- + +=== Available Models + +[cols="1,1,2"] +|=== +|Flag |Model |Notes + +|`claude-cli` +|Claude Sonnet (via CLI) +|Default. Requires `claude` CLI authenticated. + +|`claude-haiku` +|Claude Haiku (via CLI) +|Smallest Claude model. Good lower-bound test. + +|`openai` +|GPT-4o-mini (via API) +|Requires `OPENAI_API_KEY`. + +|`claude` +|Claude Sonnet (via API) +|Requires `ANTHROPIC_API_KEY`. 
Alternative to CLI. + +|`ollama` +|Local model (via Ollama) +|Requires Ollama server on `localhost:11434`. Default: `qwen3:4b`, override with `--ollama-model`. +|=== + +== Directory Structure + +[source] +---- +evaluations/ +├── README.adoc # This file +├── pilot.py # Evaluation runner script +├── specs/ # Question specs (YAML) +│ ├── arc42.yaml +│ ├── docs-as-code.yaml +│ ├── mece.yaml +│ ├── tdd-london-school.yaml +│ └── timtowtdi.yaml +└── results/ # Raw results (JSON, timestamped) + └── pilot-*.json +---- + +== Question Spec Format + +Each anchor has a YAML file with multiple-choice questions: + +[source,yaml] +---- +anchor: tdd-london-school +tier: 3 + +questions: + recognition: # Level 1: Does the model identify the anchor? + question: | + Which of the following best describes "TDD, London School"? + options: + A: ... # Distractor (e.g., Chicago School description) + B: ... # Correct answer + C: ... # Distractor (e.g., BDD description) + D: ... # Distractor + correct: B + + application: # Level 2: Does it change behavior? + scenario: | + You are reviewing a PR. ... + anchor_prompt: "using TDD, London School principles" + paraphrase_prompt: "Write isolated tests for the service layer" + options: ... + correct: B + + consistency: # Level 4: Same answer across aliases/languages? + variants: + - 'Question with canonical name' + - 'Question with alias' + language_variant: 'Frage auf Deutsch' + options: ... 
+ correct: B +---- + +== Scoring + +* Each question runs *4 times* with randomized option order (position bias mitigation) +* Score = percentage of correct answers across the 4 runs +* Response parsing: extracts first capital letter A–D from response +* Results saved as timestamped JSON in `results/` + +== Pilot Results (2026-03-24) + +[cols="1,1,1,1"] +|=== +|Model |Average |Best |Worst + +|Claude Sonnet 4.6 +|100% +|all 100% +|— + +|Claude Haiku 4.5 +|100% +|all 100% +|— + +|GPT-4o-mini +|81% +|Recognition: arc42, MECE, TIMTOWTDI (100%) +|TDD London School Recognition (25%) +|=== + +Key finding: *Position bias is real.* GPT-4o-mini recognizes "TDD, London School" only 25% of the time -- it picks the correct answer only when it happens to be in a favorable position. diff --git a/evaluations/fill-distractors.py b/evaluations/fill-distractors.py new file mode 100644 index 0000000..eeb002d --- /dev/null +++ b/evaluations/fill-distractors.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +Fill placeholder distractors in evaluation specs using Claude API. + +Reads specs with PLACEHOLDER_A/C/D options and asks Claude to generate +plausible but wrong distractors based on the anchor's domain. 
def needs_distractors(spec):
    """Report whether the recognition question still has placeholder options.

    Args:
        spec: Parsed spec mapping. Missing "questions", "recognition" or
            "options" levels are treated as empty.

    Returns:
        True if the string form of any recognition option contains
        "PLACEHOLDER", otherwise False.
    """
    recognition = spec.get("questions", {}).get("recognition", {})
    for option_text in recognition.get("options", {}).values():
        # str() so non-string option values (numbers, None) are handled too.
        if "PLACEHOLDER" in str(option_text):
            return True
    return False
+Correct answer: {correct} + +Requirements for distractors: +- Each distractor should be a one-sentence description of a DIFFERENT but related concept +- They must be wrong but sound plausible to someone unfamiliar with the topic +- All 4 options (correct + 3 distractors) should be similar in length +- Do NOT include the correct concept in any distractor +- Draw distractors from adjacent concepts in software engineering, architecture, or methodology +{f"- Related anchors for inspiration: {', '.join(related)}" if related else ""} +{f"- The correct answer is associated with: {proponents}" if proponents else ""} + +Return ONLY a JSON object with keys "A", "C", "D" containing the 3 distractor strings. No explanation.""" + + client = anthropic.Anthropic() + response = client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=300, + temperature=0.7, # some creativity for diverse distractors + messages=[{"role": "user", "content": prompt}], + ) + + text = response.content[0].text.strip() + # Parse JSON from response (might be wrapped in ```json ... 
def main():
    """Fill placeholder distractors in every matching spec file.

    Command-line flags:
        --dry-run   list the specs that would be processed, without calling
                    the generator or writing any file.
        --anchor    only process the spec whose "anchor" field equals this ID.

    For each selected spec, options A, C and D of the recognition question
    are replaced with the generated distractors, the helper keys are removed,
    and the spec is written back to the same YAML file. An error on one spec
    is printed and the remaining specs are still processed.
    """
    cli = argparse.ArgumentParser(description="Fill placeholder distractors using Claude API")
    cli.add_argument("--dry-run", action="store_true", help="Preview without writing")
    cli.add_argument("--anchor", help="Process single anchor")
    args = cli.parse_args()

    # Collect (path, parsed spec) pairs that still contain placeholders.
    pending = []
    for spec_path in sorted(SPECS_DIR.glob("*.yaml")):
        spec = yaml.safe_load(spec_path.read_text(encoding="utf-8"))
        selected = not args.anchor or spec["anchor"] == args.anchor
        if selected and needs_distractors(spec):
            pending.append((spec_path, spec))

    print(f"Found {len(pending)} specs needing distractors")

    for spec_path, spec in pending:
        current_anchor = spec["anchor"]
        print(f" {current_anchor}...", end=" ", flush=True)

        if args.dry_run:
            print("(dry run)")
            continue

        try:
            generated = generate_distractors(spec)
            recognition = spec["questions"]["recognition"]
            # B holds the correct answer and is left untouched.
            for letter in ("A", "C", "D"):
                recognition["options"][letter] = generated[letter]

            # Helper notes only guide generation; they do not belong in the final spec.
            for helper_key in ("_note", "_related", "_proponents", "_also_known_as"):
                recognition.pop(helper_key, None)

            with open(spec_path, "w", encoding="utf-8") as out:
                yaml.dump(spec, out, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print("OK")

        except Exception as err:
            # Best effort: report and move on to the next spec.
            print(f"ERROR: {err}")

    print("\nDone. Review the generated distractors before running evaluations!")
+ +Reads each anchor's Core Concepts and Related Anchors to produce: +- A correct answer from the anchor's core description +- 3 plausible distractors from related/adjacent anchors + +Output: YAML specs in evaluations/specs/ (only recognition section). +Existing specs are preserved — only missing anchors are generated. + +Usage: + python3 evaluations/generate-l1-specs.py # Generate all Tier 3 + python3 evaluations/generate-l1-specs.py --dry-run # Preview without writing + python3 evaluations/generate-l1-specs.py --anchor arc42 # Single anchor +""" + +import argparse +import os +import re +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print("PyYAML required: pip install pyyaml") + sys.exit(1) + +ANCHORS_DIR = Path(__file__).parent.parent / "docs" / "anchors" +SPECS_DIR = Path(__file__).parent / "specs" + +# Skip these anchors (templates, meta, sub-patterns handled by umbrella) +SKIP_PREFIXES = ["_template", "gof-", "solid-", "test-double-"] +SKIP_EXACT = ["what-qualifies-as-a-semantic-anchor", "gof-design-patterns", + "solid-principles", "test-double-meszaros"] + + +def parse_adoc(filepath): + """Extract metadata from an .adoc anchor file.""" + content = filepath.read_text(encoding="utf-8") + lines = content.split("\n") + + result = { + "id": filepath.stem, + "title": "", + "tier": None, + "categories": "", + "related": [], + "proponents": "", + "also_known_as": "", + "core_concepts": [], + "when_to_use": [], + } + + # Parse attributes + for line in lines: + if line.startswith("= "): + result["title"] = line[2:].strip() + elif line.startswith(":tier:"): + result["tier"] = int(line.split(":tier:")[1].strip()) + elif line.startswith(":categories:"): + result["categories"] = line.split(":categories:")[1].strip() + elif line.startswith(":related:"): + result["related"] = [r.strip() for r in line.split(":related:")[1].strip().split(",")] + elif line.startswith(":proponents:"): + result["proponents"] = 
def build_correct_answer(anchor):
    """Build a one-sentence correct answer from the anchor's core concepts.

    The first two core concepts are joined as "<first>; <second>". Each
    concept contributes its description without trailing periods, or its
    term when the description is empty.

    Only the leading character of the second clause is lower-cased, and only
    when its first word is an ordinary capitalised word. Lower-casing the
    whole clause would corrupt acronyms and proper nouns ("TDD" -> "tdd").

    Args:
        anchor: Mapping with a "core_concepts" list of
            {"term": str, "desc": str} dicts.

    Returns:
        The answer sentence, or None when there are no core concepts.
    """
    # Only the first two concepts ever reach the answer text.
    concepts = anchor["core_concepts"][:2]
    if not concepts:
        return None

    parts = [c["desc"].rstrip(".") or c["term"] for c in concepts]
    if len(parts) == 1:
        return parts[0]

    first, second = parts
    head = second.split(" ", 1)[0]
    # A word whose tail is already lower-case ("Mocks", "A") is safe to
    # decapitalise; "TDD", "JUnit" or "I/O" are left as written.
    if head[1:] == head[1:].lower():
        second = second[:1].lower() + second[1:]
    return f"{first}; {second}"
def should_skip(anchor_id):
    """Decide whether an anchor is excluded from spec generation.

    Args:
        anchor_id: Anchor identifier (the .adoc file stem).

    Returns:
        True when the ID is listed in SKIP_EXACT or starts with any prefix
        in SKIP_PREFIXES, otherwise False.
    """
    # str.startswith accepts a tuple of candidate prefixes.
    return anchor_id in SKIP_EXACT or anchor_id.startswith(tuple(SKIP_PREFIXES))
def needs_application(spec):
    """Report whether the spec still lacks an application question.

    Args:
        spec: Parsed spec mapping; a missing "questions" key counts as empty.

    Returns:
        False when "questions" contains an "application" entry, else True.
    """
    questions = spec.get("questions", {})
    if "application" in questions:
        return False
    return True
load_anchor_context(anchor_id) + + prompt = f"""Generate a Level 2 Application multiple-choice question for the semantic anchor "{title}". + +The question tests whether an LLM can APPLY the methodology, not just describe it. + +Anchor definition (from .adoc file): +{context} + +Requirements: +1. Write a realistic SCENARIO (2-3 sentences) describing a concrete software engineering situation where this anchor applies. +2. Write an ANCHOR_PROMPT — a short phrase like "using {title}" that would be added to the scenario. +3. Write a PARAPHRASE_PROMPT — describes the GOAL without naming the methodology or hinting at the correct answer. Must be fair: not too specific (leaks answer) and not too vague. +4. Write 4 OPTIONS (A, B, C, D) — one correct answer that reflects the methodology, three plausible alternatives. +5. All options should be similar in length. +6. The correct answer should reflect what a practitioner of this methodology would recommend. + +Return ONLY a JSON object with this exact structure: +{{ + "scenario": "...", + "anchor_prompt": "using {title}", + "paraphrase_prompt": "...", + "options": {{ + "A": "...", + "B": "...", + "C": "...", + "D": "..." + }}, + "correct": "B" +}} + +Make B the correct answer. 
def main():
    """Add a generated application question to every spec that lacks one.

    Command-line flags:
        --dry-run   list the specs that would be processed, without calling
                    the generator or writing any file.
        --anchor    only process the spec whose "anchor" field equals this ID.

    Specs whose anchor is in SKIP_ANCHORS are never processed. For each
    selected spec the generated question is stored under
    questions.application and the spec is written back to the same YAML
    file. An error on one spec is printed and the remaining specs are still
    processed.
    """
    cli = argparse.ArgumentParser(description="Generate L2 Application questions using Claude API")
    cli.add_argument("--dry-run", action="store_true")
    cli.add_argument("--anchor", help="Process single anchor")
    args = cli.parse_args()

    # Collect (path, parsed spec) pairs that have no application question yet.
    pending = []
    for spec_path in sorted(SPECS_DIR.glob("*.yaml")):
        spec = yaml.safe_load(spec_path.read_text(encoding="utf-8"))
        if spec["anchor"] in SKIP_ANCHORS:
            continue
        selected = not args.anchor or spec["anchor"] == args.anchor
        if selected and needs_application(spec):
            pending.append((spec_path, spec))

    print(f"Found {len(pending)} specs needing L2 Application questions")

    for spec_path, spec in pending:
        current_anchor = spec["anchor"]
        print(f" {current_anchor}...", end=" ", flush=True)

        if args.dry_run:
            print("(dry run)")
            continue

        try:
            spec["questions"]["application"] = generate_application(spec)

            with open(spec_path, "w", encoding="utf-8") as out:
                yaml.dump(spec, out, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print("OK")

        except Exception as err:
            # Best effort: report and move on to the next spec.
            print(f"ERROR: {err}")

    print("\nDone. Review the generated scenarios before running evaluations!")
def load_best_results():
    """Select, per model, the result set to report on.

    Files matching ``pilot-*.json`` in RESULTS_DIR are visited in sorted
    path order. For each model, a later file replaces the stored entry when
    it holds at least as many questions. The largest run therefore wins, and
    ties go to the file that sorts last.

    Returns:
        Dict mapping model key to a dict with keys:
            "data"      - list of question results for that model
            "file"      - name of the result file the data came from
            "config"    - the file's "config" value ({} if absent)
            "duration"  - the file's "duration_seconds" value (0 if absent)
            "timestamp" - the file's "timestamp" value ("" if absent)
    """
    results = {}
    for f in sorted(RESULTS_DIR.glob("pilot-*.json")):
        # Context manager closes the handle promptly instead of leaving it
        # to the garbage collector.
        with open(f, encoding="utf-8") as fh:
            d = json.load(fh)
        for m, r in d["models"].items():
            if m not in results or len(r) >= len(results[m]["data"]):
                results[m] = {
                    "data": r,
                    "file": f.name,
                    "config": d.get("config", {}),
                    "duration": d.get("duration_seconds", 0),
                    "timestamp": d.get("timestamp", ""),
                }
    return results
len(results[m]["data"]) >= 60: + model_names.append(m) + + # Add smaller runs if no full run exists + for m in ["claude-cli", "claude-haiku", "ollama"]: + if m in results and m not in model_names: + model_names.append(m) + + for m in model_names: + for q in results[m]["data"]: + label = q["label"] + all_questions[label][m] = q["score"] + + # Separate controls from anchors + anchor_questions = {k: v for k, v in all_questions.items() + if not any(k.startswith(c) for c in CONTROL_ANCHORS)} + control_questions = {k: v for k, v in all_questions.items() + if any(k.startswith(c) for c in CONTROL_ANCHORS)} + + # Group by anchor + anchor_groups = defaultdict(list) + for label in sorted(anchor_questions.keys()): + anchor_id = label.split("/")[0] + anchor_groups[anchor_id].append(label) + + # Model averages (excluding controls) + model_avgs = {} + for m in model_names: + scores = [anchor_questions[label].get(m) for label in anchor_questions + if anchor_questions[label].get(m) is not None] + model_avgs[m] = sum(scores) / len(scores) if scores else 0 + + html = f""" + + + + +Semantic Anchor Evaluation Report + + + +

Semantic Anchor Evaluation Report

+

Multiple-choice recognition test across {len(model_names)} LLMs — {len(anchor_questions)} questions, {len(anchor_groups)} anchors

+ +
+
≥80%
+
50–79%
+
<50%
+
+ +

Model Summary

+
+""" + + for m in model_names: + avg = model_avgs.get(m, 0) + display = MODEL_DISPLAY.get(m, m) + n = len([1 for l in anchor_questions if anchor_questions[l].get(m) is not None]) + info = results[m] + html += f"""
+
{display}
+
{avg:.0%}
+
{n} questions · {info['file']}
+
+""" + + html += """
+ +

Heatmap: Anchor × Model

+ + + +""" + + for m in model_names: + html += f" \n" + html += "\n\n" + + for anchor_id in sorted(anchor_groups.keys()): + labels = anchor_groups[anchor_id] + # Anchor group row with average + anchor_scores = {} + for m in model_names: + scores = [anchor_questions[l].get(m) for l in labels if anchor_questions[l].get(m) is not None] + anchor_scores[m] = sum(scores) / len(scores) if scores else None + + html += f'' + for m in model_names: + s = anchor_scores.get(m) + if s is not None: + bg = score_bg(s) + text = "✓" if s == 1.0 else f"{s:.0%}" + html += f'' + else: + html += '' + html += "\n" + + # Individual question rows (only show if there are multiple or if score < 100%) + if len(labels) > 1: + for label in labels: + short = label.split("/", 1)[1] if "/" in label else label + html += f'' + for m in model_names: + s = anchor_questions[label].get(m) + if s is not None: + bg = score_bg(s) + text = "✓" if s == 1.0 else f"{s:.0%}" + html += f'' + else: + html += '' + html += "\n" + + html += "
Anchor / Question{MODEL_DISPLAY.get(m, m)}
{h(anchor_id)}{text}
{h(short)}{text}
\n" + + # Controls section + if control_questions: + html += '

Control Questions

\n\n' + for m in model_names: + html += f"" + html += "\n\n" + for label in sorted(control_questions.keys()): + short = label.replace("/recognition", "") + html += f"" + for m in model_names: + s = control_questions[label].get(m) + if s is not None: + bg = score_bg(s) if "sanity" not in label else ("#dcfce7" if s == 0 else "#fee2e2") + text = f"{s:.0%}" + html += f'' + else: + html += '' + html += "\n" + html += "
Control{MODEL_DISPLAY.get(m, m)}
{short}{text}
\n" + + # Failures detail + html += "

Failures Detail

\n" + for m in model_names: + fails = [(q["label"], q["score"]) for q in results[m]["data"] + if q["score"] < 1.0 and not any(q["label"].startswith(c) for c in CONTROL_ANCHORS)] + if not fails: + html += f"

{MODEL_DISPLAY.get(m, m)}: no failures

\n" + else: + html += f'

{MODEL_DISPLAY.get(m, m)}: {len(fails)} failures

\n
\n' + for label, score in sorted(fails): + html += f'
{h(label)}{score:.0%}
\n' + html += "
\n" + + # Metadata + html += """ +
+

Run Metadata

+
+""" + for m in model_names: + info = results[m] + dur = info["duration"] + html += f"
{MODEL_DISPLAY.get(m, m)}:
{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}

" + + html += """ +
+

Generated by evaluations/generate-report.py · Position bias mitigation: 4 permutations per question · Scoring: deterministic MC (no LLM judge)

+
def load_specs():
    """Parse every ``*.yaml`` file in SPECS_DIR, in sorted path order.

    Returns:
        List with one parsed YAML document per spec file.
    """
    return [
        yaml.safe_load(spec_path.read_text(encoding="utf-8"))
        for spec_path in sorted(SPECS_DIR.glob("*.yaml"))
    ]
def parse_response(text):
    """Extract the model's answer letter (A-D) from a raw response.

    Resolution order:
      1. ``<think>...</think>`` blocks (emitted by reasoning models such as
         qwen3 or DeepSeek R1) are removed. If nothing remains, the original
         text is used instead.
      2. The first line that consists only of a letter, optionally wrapped
         in ``*``, ``.`` or ``)`` (e.g. ``B``, ``B)``, ``**B**``, ``b``),
         wins.
      3. Otherwise the first capital A-D that stands alone as a token wins,
         so ``Answer: B`` yields ``B`` and not the ``A`` of "Answer".

    Args:
        text: Raw response text, or an ``ERROR: ...`` marker standing in for
            a failed model call.

    Returns:
        "A", "B", "C" or "D", or None when no answer can be determined
        (including empty input and error markers).
    """
    import re

    if not text:
        return None
    # Failed calls are reported as "ERROR: <message>". Letters inside the
    # message must never be scored as an answer.
    if text.lstrip().startswith("ERROR:"):
        return None

    # Remove thinking blocks (qwen3, DeepSeek R1, etc.). The tags must be
    # spelled out: a bare r".*?" matches only empty strings and strips nothing.
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    # If nothing is left after stripping, fall back to the original text.
    if not cleaned:
        cleaned = text.strip()

    # Strongest signal: a line that is just the letter.
    for line in cleaned.split("\n"):
        candidate = line.strip().strip("*").strip(".").strip(")").strip()
        if candidate.upper() in ("A", "B", "C", "D"):
            return candidate.upper()

    # Fallback: first capital A-D not embedded in a word or number.
    match = re.search(r"(?<![A-Za-z0-9])[A-D](?![A-Za-z0-9])", cleaned)
    return match.group(0) if match else None
def make_openai_caller(openai_model):
    """Create a caller that sends a prompt to one OpenAI chat model.

    Args:
        openai_model: Model name bound as the caller's default ``model``.

    Returns:
        A function ``call_openai(prompt, model=openai_model)`` that returns
        ``(response_text, model)``. The ``openai`` package is imported on the
        first call; if it is missing, a hint is printed and the process exits
        with status 1.
    """
    def call_openai(prompt, model=openai_model):
        try:
            import openai
        except ImportError:
            print("openai package required: pip install openai")
            sys.exit(1)

        client = openai.OpenAI()
        # GPT-5+ and reasoning models require different parameters
        is_new_api = any(x in model for x in ("gpt-5", "o3", "o4"))
        kwargs = {"model": model, "messages": [{"role": "user", "content": prompt}]}
        if is_new_api:
            kwargs["max_completion_tokens"] = 2048
            # GPT-5 only supports temperature=1
        else:
            kwargs["max_tokens"] = 10
            kwargs["temperature"] = TEMPERATURE
        response = client.chat.completions.create(**kwargs)
        # The API may return content=None (e.g. a refusal or an exhausted
        # token budget); treat that as an empty answer instead of raising
        # AttributeError on .strip().
        content = response.choices[0].message.content or ""
        return content.strip(), model
    return call_openai
print("openai package required: pip install openai") + sys.exit(1) + + client = openai.OpenAI( + base_url="https://api.mistral.ai/v1", + api_key=os.environ.get("MISTRAL_API_KEY", ""), + ) + response = client.chat.completions.create( + model=model, + max_tokens=10, + temperature=TEMPERATURE, + messages=[{"role": "user", "content": prompt}], + ) + return response.choices[0].message.content.strip(), model + return call_mistral + + +def make_deepseek_caller(deepseek_model): + """Create a DeepSeek caller via OpenAI-compatible API.""" + def call_deepseek(prompt, model=deepseek_model): + try: + import openai + except ImportError: + print("openai package required: pip install openai") + sys.exit(1) + + client = openai.OpenAI( + base_url="https://api.deepseek.com", + api_key=os.environ.get("DEEPSEEK_API_KEY", ""), + ) + response = client.chat.completions.create( + model=model, + max_tokens=10, + temperature=TEMPERATURE, + messages=[{"role": "user", "content": prompt}], + ) + return response.choices[0].message.content.strip(), model + return call_deepseek + + +def make_ollama_caller(ollama_model, no_think=False, base_url="http://localhost:11434"): + """Create an Ollama caller for a specific model.""" + def call_ollama(prompt, model=ollama_model): + import urllib.request + + body = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "stream": False, + "options": {"temperature": TEMPERATURE}, + } + if no_think: + body["think"] = False + + data = json.dumps(body).encode("utf-8") + req = urllib.request.Request( + f"{base_url}/api/chat", + data=data, + headers={"Content-Type": "application/json"}, + ) + with urllib.request.urlopen(req, timeout=300) as resp: + result = json.loads(resp.read()) + + content = result.get("message", {}).get("content", "") + return content, f"ollama/{model}" + return call_ollama + + +def run_question(question_data, call_fn, label, context="", verbose=False): + """Run a single question 4x with randomized positions. 
Returns results.""" + question_text = question_data["question"] + if context: + question_text = f"{context}\n{question_text}" + options = question_data["options"] + original_correct = question_data["correct"] + results = [] + for i, perm in enumerate(POSITION_PERMUTATIONS): + prompt = build_prompt(question_text, options, perm) + expected = correct_letter_for_permutation(original_correct, perm) + + try: + response_text, model_id = call_fn(prompt) + except Exception as e: + response_text = f"ERROR: {e}" + if verbose: + print(f"\n [ERROR] {e}") + + answer = parse_response(response_text) + correct = answer == expected + + if verbose and i == 0: # show first permutation only + print(f"\n [RAW] expected={expected} parsed={answer} response={repr(response_text[:200])}") + + results.append({ + "permutation": [LETTERS[p] for p in perm], + "expected": expected, + "answer": answer, + "correct": correct, + "raw_response": response_text.strip()[:500], + }) + time.sleep(0.5) # rate limiting + + score = sum(1 for r in results if r["correct"]) / len(results) + return { + "label": label, + "score": score, + "results": results, + } + + +def save_results(all_results, out_file): + """Save results incrementally after each question.""" + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + with open(out_file, "w", encoding="utf-8") as fh: + json.dump(all_results, fh, indent=2, ensure_ascii=False) + + +def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_think=False, + ollama_url="http://localhost:11434", openai_model="gpt-4o-mini", + mistral_model="mistral-large-latest", deepseek_model="deepseek-chat"): + start_time = time.time() + specs = load_specs() + print(f"Loaded {len(specs)} anchor specs") + print(f"Models: {', '.join(models)}") + print(f"Temperature: {TEMPERATURE}") + if "openai" in models: + print(f"OpenAI model: {openai_model}") + if "mistral" in models: + print(f"Mistral model: {mistral_model}") + if "deepseek" in models: + print(f"DeepSeek model: 
{deepseek_model}") + if "ollama" in models: + print(f"Ollama model: {ollama_model}") + print(f"Ollama URL: {ollama_url}") + print(f"No-think: {no_think}") + print(f"Dry run: {dry_run}") + print() + + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + out_file = RESULTS_DIR / f"pilot-{ts}.json" + + all_results = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "config": { + "models": models, + "openai_model": openai_model if "openai" in models else None, + "mistral_model": mistral_model if "mistral" in models else None, + "deepseek_model": deepseek_model if "deepseek" in models else None, + "ollama_model": ollama_model if "ollama" in models else None, + "ollama_url": ollama_url if "ollama" in models else None, + "no_think": no_think if "ollama" in models else None, + "temperature": TEMPERATURE, + }, + "models": {}, + } + + for model_name in models: + if model_name == "claude": + call_fn = call_claude_api + elif model_name == "claude-cli": + call_fn = call_claude_cli + elif model_name == "claude-haiku": + call_fn = call_claude_haiku + elif model_name == "openai": + call_fn = make_openai_caller(openai_model) + elif model_name == "mistral": + call_fn = make_mistral_caller(mistral_model) + elif model_name == "deepseek": + call_fn = make_deepseek_caller(deepseek_model) + elif model_name == "ollama": + call_fn = make_ollama_caller(ollama_model, no_think=no_think, base_url=ollama_url) + else: + print(f"Unknown model: {model_name}") + continue + + # Count total questions for progress display + total_q = 0 + for spec in specs: + questions = spec.get("questions", {}) + if "recognition" in questions: total_q += 1 + if "application" in questions: total_q += 2 # anchor + paraphrase + if "consistency" in questions: + cons = questions["consistency"] + total_q += len(cons.get("variants", [])) + if cons.get("language_variant"): total_q += 1 + + print(f"=== {model_name.upper()} ({total_q} questions) ===") + model_results = [] + all_results["models"][model_name] = model_results 
+ current_q = [0] + + def append_and_save(r): + model_results.append(r) + current_q[0] += 1 + if not dry_run: + save_results(all_results, out_file) + + for spec in specs: + anchor = spec["anchor"] + questions = spec.get("questions", {}) + + # Level 1: Recognition + if "recognition" in questions: + q = questions["recognition"] + if dry_run: + prompt = build_prompt(q["question"], q["options"], POSITION_PERMUTATIONS[0]) + print(f"\n[DRY RUN] {anchor} / recognition:") + print(prompt) + else: + print(f" [{current_q[0]+1}/{total_q}] {anchor} / recognition...", end=" ", flush=True) + result = run_question(q, call_fn, f"{anchor}/recognition", verbose=verbose) + print(f"{result['score']:.0%}") + append_and_save(result) + + # Level 2: Application (anchor variant) + if "application" in questions: + app = questions["application"] + anchor_q = { + "question": f"{app['scenario'].strip()}\n{app['anchor_prompt']}", + "options": app["options"], + "correct": app["correct"], + } + para_q = { + "question": f"{app['scenario'].strip()}\n{app['paraphrase_prompt']}", + "options": app["options"], + "correct": app["correct"], + } + if dry_run: + prompt = build_prompt(anchor_q["question"], anchor_q["options"], POSITION_PERMUTATIONS[0]) + print(f"\n[DRY RUN] {anchor} / application (anchor):") + print(prompt) + else: + print(f" [{current_q[0]+1}/{total_q}] {anchor} / application (anchor)...", end=" ", flush=True) + result_a = run_question(anchor_q, call_fn, f"{anchor}/application-anchor", verbose=verbose) + print(f"{result_a['score']:.0%}") + append_and_save(result_a) + + print(f" [{current_q[0]+1}/{total_q}] {anchor} / application (paraphrase)...", end=" ", flush=True) + result_p = run_question(para_q, call_fn, f"{anchor}/application-paraphrase", verbose=verbose) + print(f"{result_p['score']:.0%}") + append_and_save(result_p) + + # Level 4: Consistency + if "consistency" in questions: + cons = questions["consistency"] + variants = cons.get("variants", []) + lang = cons.get("language_variant") 
+ if lang: + variants = variants + [lang] + + for i, variant in enumerate(variants): + variant_q = { + "question": variant, + "options": cons["options"], + "correct": cons["correct"], + } + variant_label = f"variant-{i+1}" if i < len(cons.get("variants", [])) else "language" + if dry_run: + prompt = build_prompt(variant_q["question"], variant_q["options"], POSITION_PERMUTATIONS[0]) + print(f"\n[DRY RUN] {anchor} / consistency ({variant_label}):") + print(prompt) + else: + print(f" [{current_q[0]+1}/{total_q}] {anchor} / consistency ({variant_label})...", end=" ", flush=True) + result = run_question(variant_q, call_fn, f"{anchor}/consistency-{variant_label}", verbose=verbose) + print(f"{result['score']:.0%}") + append_and_save(result) + + all_results["models"][model_name] = model_results + + elapsed = time.time() - start_time + all_results["duration_seconds"] = round(elapsed, 1) + + if not dry_run: + save_results(all_results, out_file) + print(f"\nResults saved to {out_file}") + + # Summary + print("\n=== SUMMARY ===") + print(f"Models: {', '.join(models)}") + print(f"Temperature: {TEMPERATURE}") + if "openai" in models: + print(f"OpenAI: {openai_model}") + if "mistral" in models: + print(f"Mistral: {mistral_model}") + if "deepseek" in models: + print(f"DeepSeek: {deepseek_model}") + if "ollama" in models: + print(f"Ollama: {ollama_model} @ {ollama_url} (no-think={no_think})") + minutes, seconds = divmod(int(elapsed), 60) + print(f"Duration: {minutes}m {seconds}s") + print() + for model_name, results in all_results["models"].items(): + scores = [r["score"] for r in results] + avg = sum(scores) / len(scores) if scores else 0 + print(f"{model_name}: {avg:.0%} average ({len(scores)} questions)") + for r in results: + status = "✓" if r["score"] == 1.0 else f"{r['score']:.0%}" + print(f" {r['label']}: {status}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Pilot evaluation runner") + parser.add_argument("--model", nargs="+", 
default=["claude-cli"], + choices=["claude", "claude-cli", "claude-haiku", "openai", "mistral", "deepseek", "ollama"], + help="Models to evaluate (default: claude-cli)") + parser.add_argument("--openai-model", default="gpt-4o-mini", + help="OpenAI model name (default: gpt-4o-mini). Try: gpt-5, gpt-5-mini, gpt-4o") + parser.add_argument("--mistral-model", default="mistral-large-latest", + help="Mistral model name (default: mistral-large-latest)") + parser.add_argument("--deepseek-model", default="deepseek-chat", + help="DeepSeek model name (default: deepseek-chat)") + parser.add_argument("--ollama-model", default="qwen3:4b", + help="Ollama model name (default: qwen3:4b)") + parser.add_argument("--ollama-url", default="http://localhost:11434", + help="Ollama API base URL (default: http://localhost:11434)") + parser.add_argument("--temperature", type=float, default=0.0, + help="Sampling temperature (default: 0.0). Note: claude-cli/claude-haiku ignore this.") + parser.add_argument("--no-think", action="store_true", + help="Disable reasoning/thinking for Ollama models (faster, fewer tokens)") + parser.add_argument("--dry-run", action="store_true", + help="Show prompts without sending") + parser.add_argument("--verbose", action="store_true", + help="Print raw responses for debugging") + args = parser.parse_args() + set_temperature(args.temperature) + run_pilot(args.model, args.dry_run, args.verbose, args.ollama_model, args.no_think, + args.ollama_url, args.openai_model, args.mistral_model, args.deepseek_model) diff --git a/evaluations/report.html b/evaluations/report.html new file mode 100644 index 0000000..0fd9fb2 --- /dev/null +++ b/evaluations/report.html @@ -0,0 +1,388 @@ + + + + + +Semantic Anchor Evaluation Report + + + +

Semantic Anchor Evaluation Report

+

Multiple-choice recognition test across 3 LLMs — 191 questions, 61 anchors

+ +
+
≥80%
+
50–79%
+
<50%
+
+ +

Model Summary

+
+
+
Claude Sonnet
+
99%
+
191 questions · pilot-20260324-174404.json
+
+
+
GPT-4o
+
98%
+
191 questions · pilot-20260324-192413.json
+
+
+
Mistral Large
+
96%
+
191 questions · pilot-20260324-190600.json
+
+
+ +

Heatmap: Anchor × Model

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Anchor / QuestionClaude SonnetGPT-4oMistral Large
adr-according-to-nygard92%
application-anchor
application-paraphrase
recognition75%
arc42
application-anchor
application-paraphrase
consistency-language
consistency-variant-1
consistency-variant-2
consistency-variant-3
recognition
atam
application-anchor
application-paraphrase
recognition
bdd-given-when-then83%
application-anchor
application-paraphrase50%
recognition
bem-methodology
application-anchor
application-paraphrase
recognition
bluf
application-anchor
application-paraphrase
recognition
c4-diagrams
application-anchor
application-paraphrase
recognition
chain-of-thought
application-anchor
application-paraphrase
recognition
clean-architecture
application-anchor
application-paraphrase
recognition
control-chart-shewhart92%
application-anchor
application-paraphrase75%
recognition
conventional-commits
application-anchor
application-paraphrase
recognition
cqrs
application-anchor
application-paraphrase
recognition
cynefin-framework
application-anchor
application-paraphrase
recognition
definition-of-done
application-anchor
application-paraphrase
recognition
devils-advocate
application-anchor
application-paraphrase
recognition
diataxis-framework
application-anchor
application-paraphrase
recognition
docs-as-code
application-anchor
application-paraphrase
recognition
domain-driven-design
application-anchor
application-paraphrase
recognition
ears-requirements92%83%
application-anchor
application-paraphrase75%75%
recognition75%
event-driven-architecture
application-anchor
application-paraphrase
recognition
fagan-inspection
application-anchor
application-paraphrase
recognition
feynman-technique67%67%92%
application-anchor
application-paraphrase0%0%75%
recognition
five-whys
application-anchor
application-paraphrase
recognition
fowler-patterns
application-anchor
application-paraphrase
recognition
gherkin
application-anchor
application-paraphrase
recognition
github-flow92%92%92%
application-anchor
application-paraphrase75%75%75%
recognition
gutes-deutsch-wolf-schneider
application-anchor
application-paraphrase
recognition
hexagonal-architecture
application-anchor
application-paraphrase
recognition
iec-61508-sil-levels92%83%
application-anchor50%
application-paraphrase75%
recognition
impact-mapping
application-anchor
application-paraphrase
recognition
invest
application-anchor
application-paraphrase
recognition
iso-2501083%
application-anchor75%
application-paraphrase75%
recognition
jobs-to-be-done
application-anchor
application-paraphrase
recognition
lasr92%75%
application-anchor
application-paraphrase
recognition75%25%
linddun
application-anchor
application-paraphrase
recognition
llm-evaluations
application-anchor
application-paraphrase
recognition
madr
application-anchor
application-paraphrase
recognition
mece
application-anchor
application-paraphrase
recognition
morphological-box
application-anchor
application-paraphrase
recognition
moscow92%75%
application-anchor
application-paraphrase75%25%
recognition
mutation-testing
application-anchor
application-paraphrase
recognition
nelson-rules
application-anchor
application-paraphrase
recognition
owasp-top-10
application-anchor
application-paraphrase
recognition
plain-english-strunk-white
application-anchor
application-paraphrase
recognition
prd92%67%
application-anchor
application-paraphrase
recognition75%0%
problem-space-nvc83%
application-anchor75%
application-paraphrase75%
recognition
property-based-testing83%
application-anchor75%
application-paraphrase75%
recognition
pyramid-principle
application-anchor
application-paraphrase
recognition
semantic-versioning75%
application-anchor50%
application-paraphrase75%
recognition
socratic-method
application-anchor
application-paraphrase
recognition
sota
application-anchor
application-paraphrase
recognition
spc
application-anchor
application-paraphrase
recognition
stride
application-anchor
application-paraphrase
recognition
swot
application-anchor
application-paraphrase
recognition
tdd-chicago-school92%
application-anchor
application-paraphrase
recognition75%
tdd-london-school89%
application-anchor
application-paraphrase
consistency-language
consistency-variant-1
consistency-variant-2
consistency-variant-350%
recognition75%
testing-pyramid
application-anchor
application-paraphrase
recognition
timtowtdi
application-anchor
application-paraphrase
recognition
todotxt-flavoured-markdown83%
application-anchor
application-paraphrase
recognition50%
user-story-mapping
application-anchor
application-paraphrase
recognition
wardley-mapping
application-anchor
application-paraphrase
recognition
+

Control Questions

+ + + + + +
ControlClaude SonnetGPT-4oMistral Large
negative-control100%100%75%
sanity-check0%0%0%
+

Failures Detail

+

Claude Sonnet: 2 failures

+
+
feynman-technique/application-paraphrase0%
+
github-flow/application-paraphrase75%
+
+

GPT-4o: 13 failures

+
+
control-chart-shewhart/application-paraphrase75%
+
ears-requirements/application-paraphrase75%
+
feynman-technique/application-paraphrase0%
+
github-flow/application-paraphrase75%
+
iec-61508-sil-levels/application-paraphrase75%
+
lasr/recognition75%
+
moscow/application-paraphrase75%
+
prd/recognition75%
+
property-based-testing/application-anchor75%
+
property-based-testing/application-paraphrase75%
+
tdd-chicago-school/recognition75%
+
tdd-london-school/consistency-variant-350%
+
tdd-london-school/recognition75%
+
+

Mistral Large: 17 failures

+
+
adr-according-to-nygard/recognition75%
+
bdd-given-when-then/application-paraphrase50%
+
ears-requirements/application-paraphrase75%
+
ears-requirements/recognition75%
+
feynman-technique/application-paraphrase75%
+
github-flow/application-paraphrase75%
+
iec-61508-sil-levels/application-anchor50%
+
iso-25010/application-anchor75%
+
iso-25010/application-paraphrase75%
+
lasr/recognition25%
+
moscow/application-paraphrase25%
+
prd/recognition0%
+
problem-space-nvc/application-anchor75%
+
problem-space-nvc/application-paraphrase75%
+
semantic-versioning/application-anchor50%
+
semantic-versioning/application-paraphrase75%
+
todotxt-flavoured-markdown/recognition50%
+
+ +
+

Run Metadata

+
+
Claude Sonnet:
pilot-20260324-174404.json · 81m 2s · 2026-03-24T17:44:04

GPT-4o:
pilot-20260324-192413.json · 15m 38s · 2026-03-24T19:24:13

Mistral Large:
pilot-20260324-190600.json · 16m 58s · 2026-03-24T19:06:00

+
+

Generated by evaluations/generate-report.py · Position bias mitigation: 4 permutations per question · Scoring: deterministic MC (no LLM judge)

+
+ + + \ No newline at end of file diff --git a/evaluations/results/pilot-20260324-174404.json b/evaluations/results/pilot-20260324-174404.json new file mode 100644 index 0000000..110f33a --- /dev/null +++ b/evaluations/results/pilot-20260324-174404.json @@ -0,0 +1,10442 @@ +{ + "timestamp": "2026-03-24T17:44:04.891380+00:00", + "config": { + "models": [ + "claude" + ], + "openai_model": null, + "mistral_model": null, + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "claude": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + 
"D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "arc42/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": 
[ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-variant-1", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-variant-2", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-variant-3", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + 
"raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-language", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "atam/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "atam/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", 
+ "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "atam/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bdd-given-when-then/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + 
"B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bem-methodology/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bem-methodology/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + 
"raw_response": "C" + } + ] + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bluf/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bluf/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bluf/application-paraphrase", + "score": 1.0, + 
"results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "c4-diagrams/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "c4-diagrams/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + 
"answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "chain-of-thought/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "chain-of-thought/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + 
"permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "clean-architecture/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + 
"answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "control-chart-shewhart/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, 
+ { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "conventional-commits/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "conventional-commits/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + 
"expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + 
"A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cynefin-framework/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cynefin-framework/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cynefin-framework/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": 
true, + "raw_response": "C" + } + ] + }, + { + "label": "definition-of-done/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "definition-of-done/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": 
"devils-advocate/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "devils-advocate/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "diataxis-framework/recognition", + "score": 1.0, + "results": [ + { + 
"permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "diataxis-framework/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "docs-as-code/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": 
"B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "domain-driven-design/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + 
"C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "ears-requirements/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + 
"correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "ears-requirements/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "event-driven-architecture/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + 
"C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fagan-inspection/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": 
"D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fagan-inspection/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "feynman-technique/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + 
"D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "feynman-technique/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "C", + "correct": false, + "raw_response": "C" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "D", + "correct": false, + "raw_response": "D" + } + ] + }, + { + "label": "five-whys/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": 
true, + "raw_response": "C" + } + ] + }, + { + "label": "five-whys/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fowler-patterns/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fowler-patterns/application-anchor", 
+ "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gherkin/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "gherkin/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + 
"expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "github-flow/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "github-flow/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + 
"permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "github-flow/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gutes-deutsch-wolf-schneider/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + 
"expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "hexagonal-architecture/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + 
"raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + 
"A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "impact-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "impact-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + 
"raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "invest/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "invest/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + 
"answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "invest/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iso-25010/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iso-25010/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": 
"iso-25010/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "jobs-to-be-done/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0, + "results": [ + { + 
"permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "lasr/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "lasr/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "lasr/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + 
"raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "linddun/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "linddun/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "linddun/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + 
"answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "llm-evaluations/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "llm-evaluations/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": 
[ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "madr/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "madr/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "madr/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, 
+ { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mece/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mece/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mece/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + 
"raw_response": "C" + } + ] + }, + { + "label": "morphological-box/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "morphological-box/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "morphological-box/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/recognition", + 
"score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mutation-testing/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": 
"B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mutation-testing/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "negative-control/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + 
"permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + } + ] + }, + { + "label": "nelson-rules/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "nelson-rules/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + 
"correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "owasp-top-10/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "owasp-top-10/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + 
"B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + 
"correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "prd/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "prd/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "prd/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", 
+ "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "problem-space-nvc/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + 
"label": "property-based-testing/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "property-based-testing/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0, + 
"results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "pyramid-principle/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sanity-check/recognition", + "score": 0.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": 
"X", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "X", + "answer": null, + "correct": false, + "raw_response": "None of the options provided match the famous answer from" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "X", + "answer": null, + "correct": false, + "raw_response": "None of the options provided match the correct answer from" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "X", + "answer": null, + "correct": false, + "raw_response": "None of the options provided match the famous answer from" + } + ] + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "semantic-versioning/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": 
"semantic-versioning/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "socratic-method/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "socratic-method/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "socratic-method/application-paraphrase", + "score": 1.0, + "results": [ + { + 
"permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + 
"raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "spc/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "spc/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "spc/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + 
"correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + 
"expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "swot/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "swot/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "swot/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + 
"A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-chicago-school/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-chicago-school/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": 
true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": 
"tdd-london-school/consistency-variant-1", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/consistency-language", + "score": 1.0, + 
"results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "testing-pyramid/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": 
"B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "timtowtdi/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "timtowtdi/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + 
"C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + 
"answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "user-story-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + 
"permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "wardley-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "wardley-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": 
"D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + } + ] + }, + "duration_seconds": 4862.1 +} \ No newline at end of file diff --git a/evaluations/results/pilot-20260324-190600.json b/evaluations/results/pilot-20260324-190600.json new file mode 100644 index 0000000..613af48 --- /dev/null +++ b/evaluations/results/pilot-20260324-190600.json @@ -0,0 +1,10442 @@ +{ + "timestamp": "2026-03-24T19:06:00.394684+00:00", + "config": { + "models": [ + "mistral" + ], + "openai_model": null, + "mistral_model": "mistral-large-latest", + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "mistral": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + 
"A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "arc42/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + 
"raw_response": "C" + } + ] + }, + { + "label": "arc42/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-variant-1", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-variant-2", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-variant-3", + "score": 1.0, + 
"results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-language", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "atam/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "atam/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": 
true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "atam/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bdd-given-when-then/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + 
], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 0.5, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": null, + "correct": false, + "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": null, + "correct": false, + "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bem-methodology/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": 
"bem-methodology/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bluf/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bluf/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + 
"B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bluf/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "c4-diagrams/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "c4-diagrams/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + 
}, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "chain-of-thought/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "chain-of-thought/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + 
"answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "clean-architecture/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + 
"permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "control-chart-shewhart/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": 
"D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "conventional-commits/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "conventional-commits/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + 
}, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + 
"correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cynefin-framework/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cynefin-framework/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": 
"cynefin-framework/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "definition-of-done/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "definition-of-done/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 1.0, + "results": 
[ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "devils-advocate/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "devils-advocate/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + 
"answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "diataxis-framework/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "diataxis-framework/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + 
"permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "docs-as-code/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + 
"correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "domain-driven-design/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ 
+ "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "ears-requirements/recognition", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "C", + "correct": false, + "raw_response": "C)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "ears-requirements/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": 
"D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "event-driven-architecture/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, 
+ { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fagan-inspection/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fagan-inspection/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + 
"answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "feynman-technique/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "feynman-technique/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + 
"label": "five-whys/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "five-whys/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fowler-patterns/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", 
+ "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fowler-patterns/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gherkin/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + 
"raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "gherkin/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "github-flow/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + 
"answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "github-flow/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "github-flow/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gutes-deutsch-wolf-schneider/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + 
"permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "hexagonal-architecture/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], 
+ "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + 
"raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 0.5, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "D", + "correct": false, + "raw_response": "D" + } + ] + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "impact-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + 
"C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "impact-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "invest/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + 
] + }, + { + "label": "invest/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "invest/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iso-25010/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iso-25010/application-anchor", + "score": 0.75, + "results": [ + { + "permutation": 
[ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iso-25010/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "jobs-to-be-done/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": 
true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "lasr/recognition", + "score": 0.25, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "C", + "correct": false, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "A", + "correct": false, + "raw_response": "A" + } + ] + }, + { + "label": "lasr/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + 
"expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "lasr/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "linddun/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "linddun/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + 
"C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "linddun/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "llm-evaluations/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "llm-evaluations/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + 
"raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "madr/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "madr/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + 
"answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "madr/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mece/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mece/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mece/application-paraphrase", 
+ "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "morphological-box/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "morphological-box/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "morphological-box/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + 
"D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/application-paraphrase", + "score": 0.25, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + 
"permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "D", + "correct": false, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "B", + "correct": false, + "raw_response": "B" + } + ] + }, + { + "label": "mutation-testing/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mutation-testing/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + 
"answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "negative-control/recognition", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "D", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + } + ] + }, + { + "label": "nelson-rules/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "nelson-rules/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + 
"D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "owasp-top-10/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "owasp-top-10/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + 
"raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + 
"B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "prd/recognition", + "score": 0.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "D", + "correct": false, + "raw_response": "D" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "C", + "correct": false, + "raw_response": "C" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "A", + "correct": false, + "raw_response": "A" + } + ] + }, + { + "label": "prd/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": 
"C" + } + ] + }, + { + "label": "prd/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "problem-space-nvc/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "C", + "correct": false, + "raw_response": "C" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 
0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "C", + "correct": false, + "raw_response": "C" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "property-based-testing/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "property-based-testing/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + 
"C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "pyramid-principle/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + 
"raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sanity-check/recognition", + "score": 0.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "X", + "answer": null, + "correct": false, + "raw_response": "None of the above options is correct, but the" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "X", + "answer": null, + "correct": false, + "raw_response": "None of the options provided is correct, but the" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "X", + "answer": null, + "correct": false, + "raw_response": "None of the options provided is correct, but the" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "X", + "answer": "C", + "correct": false, + "raw_response": "C" + } + ] + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "semantic-versioning/application-anchor", + "score": 0.5, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], 
+ "expected": "B", + "answer": "C", + "correct": false, + "raw_response": "C" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "D", + "correct": false, + "raw_response": "D" + } + ] + }, + { + "label": "semantic-versioning/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "D", + "correct": false, + "raw_response": "D" + } + ] + }, + { + "label": "socratic-method/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "socratic-method/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": 
"B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "socratic-method/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + 
"correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "spc/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "spc/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", 
+ "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "spc/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "None of the options perfectly describe the **STRIDE" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, 
+ { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "swot/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "swot/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) SWOT Analysis\nB) Value Chain" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + 
"answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "swot/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-chicago-school/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": 
"tdd-chicago-school/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 1.0, + "results": [ 
+ { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/consistency-variant-1", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + 
"expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/consistency-language", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "testing-pyramid/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + 
{ + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "timtowtdi/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "timtowtdi/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + 
"correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 0.5, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "D", + "correct": false, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": 
[ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "user-story-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": 
"D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "wardley-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "wardley-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + 
"D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + } + ] + }, + "duration_seconds": 1018.4 +} \ No newline at end of file diff --git a/evaluations/results/pilot-20260324-192413.json b/evaluations/results/pilot-20260324-192413.json new file mode 100644 index 0000000..8c5f85e --- /dev/null +++ b/evaluations/results/pilot-20260324-192413.json @@ -0,0 +1,10442 @@ +{ + "timestamp": "2026-03-24T19:24:13.551875+00:00", + "config": { + "models": [ + "openai" + ], + "openai_model": "gpt-4o", + "mistral_model": null, + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "openai": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + 
} + ] + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "arc42/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) A 12-section template for standardized software" + } + ] + }, + { + 
"label": "arc42/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) External interfaces in Section 3 (Context" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "arc42/consistency-variant-1", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Gernot Starke" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Gernot Starke" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Gernot Starke" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Gernot Starke" + } + ] + }, + { + 
"label": "arc42/consistency-variant-2", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Gernot Starke" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Gernot Starke" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Gernot Starke" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Gernot Starke" + } + ] + }, + { + "label": "arc42/consistency-variant-3", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Gernot Starke" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Gernot Starke" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Gernot Starke" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Gernot Starke" + } + ] + }, + { + "label": "arc42/consistency-language", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": 
"C)" + } + ] + }, + { + "label": "atam/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "atam/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "atam/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Build a utility tree to prioritize quality scenarios" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Build a utility tree to prioritize quality scenarios" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": 
"C) Build a utility tree to prioritize quality scenarios" + } + ] + }, + { + "label": "bdd-given-when-then/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Organize discovery workshops with the three amigos" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", 
+ "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bem-methodology/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bem-methodology/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Use structured class names like `.navbar`," + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Use structured class names like `.navbar`," + } + ] + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Use structured class names like `.navbar`," + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Use structured class names like `.navbar`," 
+ }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Use structured class names like `.navbar`," + } + ] + }, + { + "label": "bluf/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bluf/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "bluf/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + 
"expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "c4-diagrams/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Four levels of abstraction; : system in" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Four levels of abstraction; : system in" + } + ] + }, + { + "label": "c4-diagrams/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + 
"expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "chain-of-thought/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "chain-of-thought/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + 
} + ] + }, + { + "label": "clean-architecture/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Define payment processing use cases in the core" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + 
"label": "control-chart-shewhart/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Measured value plotted over time; process" + } + ] + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": 
"conventional-commits/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "conventional-commits/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/recognition", + "score": 1.0, + "results": [ + { + 
"permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cqrs/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Create separate optimized data models: a normalized" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Create separate optimized data models: a normalized" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Create separate optimized data models: a normalized" + } + ] + }, + { + "label": 
"cynefin-framework/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Five domains; : best practices apply," + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Five domains; : best practices apply," + } + ] + }, + { + "label": "cynefin-framework/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "cynefin-framework/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + 
"label": "definition-of-done/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "definition-of-done/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Collaboratively create a single, team-wide" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": 
"devils-advocate/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Present opposing viewpoints even if not personally held" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "devils-advocate/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Systematically argue against your own design by" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" 
+ } + ] + }, + { + "label": "diataxis-framework/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Four documentation types; : learning-oriented," + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "diataxis-framework/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Develop four distinct documentation sections: beginner tutorials" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Develop four distinct documentation sections: beginner tutorials" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Develop four distinct 
documentation sections: beginner tutorials" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "docs-as-code/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + 
"B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "domain-driven-design/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Establish a ubiquitous language by working closely with" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + 
"C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "ears-requirements/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "ears-requirements/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Structure requirements using specific templates: 'The" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Structure requirements using specific templates: 'The" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Structure requirements using specific templates: 'The" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B) Create a comprehensive requirements specification document with functional" + }, + { + 
"permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Structure requirements using specific templates: 'The" + } + ] + }, + { + "label": "event-driven-architecture/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, 
+ { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "fagan-inspection/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fagan-inspection/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + 
"answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "feynman-technique/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "feynman-technique/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "C", + "correct": false, + "raw_response": "C)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + 
"permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "A", + "correct": false, + "raw_response": "A" + } + ] + }, + { + "label": "five-whys/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "five-whys/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": 
true, + "raw_response": "C" + } + ] + }, + { + "label": "fowler-patterns/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "fowler-patterns/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Use a Domain Model pattern for complex business" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Use a Domain Model pattern for complex business" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": 
"C", + "correct": true, + "raw_response": "C) Use a Domain Model pattern for complex business" + } + ] + }, + { + "label": "gherkin/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "gherkin/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + 
] + }, + { + "label": "github-flow/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Workflow steps" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Workflow steps" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Workflow steps" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "github-flow/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "github-flow/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": 
"gutes-deutsch-wolf-schneider/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "hexagonal-architecture/recognition", + 
"score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + 
"B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "D", + "correct": false, + "raw_response": "D" + } + ] + }, + { + "label": "impact-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": 
true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Goal → Actors → Impacts → Deliver" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "impact-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Map the retention goal to key actors (" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Map the retention goal to key actors (" + } + ] + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Map the retention goal to key actors (" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "invest/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + 
"C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "invest/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "invest/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Split this into multiple smaller stories with specific" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Split this into multiple smaller stories with specific" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "iso-25010/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + 
"B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "iso-25010/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "iso-25010/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "jobs-to-be-done/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + 
"raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Interview users about the specific circumstances that led" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Interview users about the specific circumstances that led" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Interview users about the specific circumstances that led" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Interview users about the specific circumstances that led" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Interview users about the specific 
circumstances that led" + } + ] + }, + { + "label": "lasr/recognition", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "A", + "correct": false, + "raw_response": "A" + } + ] + }, + { + "label": "lasr/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "lasr/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "linddun/recognition", + "score": 1.0, + "results": [ + 
{ + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "linddun/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "linddun/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Systematically analyze the system against seven privacy" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Systematically analyze the system against seven privacy" + } + ] + }, + { + "label": "llm-evaluations/recognition", + "score": 1.0, 
+ "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "llm-evaluations/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "madr/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", 
+ "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "madr/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Document the decision with sections for context," + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Document the decision with sections for context," + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "madr/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Document the decision with sections for context," + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "mece/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", 
+ "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Structuring categories so they do not overlap" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mece/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Organize by business capability: User Service" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Organize by business capability: User Service" + } + ] + }, + { + "label": "mece/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Organize by business capability: User Service" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Organize by business capability: 
User Service" + } + ] + }, + { + "label": "morphological-box/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Break complex problem into independent parameters/dim" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Break complex problem into independent parameters/dim" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Break complex problem into independent parameters/dim" + } + ] + }, + { + "label": "morphological-box/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "morphological-box/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + 
"D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "moscow/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "C", + "correct": false, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" 
+ } + ] + }, + { + "label": "mutation-testing/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mutation-testing/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "negative-control/recognition", + "score": 1.0, + 
"results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + } + ] + }, + { + "label": "nelson-rules/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "nelson-rules/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + 
"answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "owasp-top-10/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "owasp-top-10/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", 
+ "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + 
"answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "prd/recognition", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "A", + "correct": false, + "raw_response": "A" + } + ] + }, + { + "label": "prd/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "prd/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Write a comprehensive document that defines the problem" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": 
"A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Write a comprehensive document that defines the problem" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "problem-space-nvc/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Concrete, objective facts without evaluation or judgment" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + 
"answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "property-based-testing/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Invariants that should always hold; automatic" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "property-based-testing/application-anchor", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Define mathematical invariants like 'interest calculations" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "A", + "correct": false, + "raw_response": "A" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ 
+ "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "D", + "correct": false, + "raw_response": "D) Write comprehensive unit tests covering typical financial scenarios" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Define mathematical invariants like 'interest calculations" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "pyramid-principle/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": 
true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sanity-check/recognition", + "score": 0.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "X", + "answer": "A", + "correct": false, + "raw_response": "None of the options provided are correct. The Answer" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "X", + "answer": "A", + "correct": false, + "raw_response": "None of the options provided correspond to the Answer to" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "X", + "answer": "A", + "correct": false, + "raw_response": "None of the options provided correspond to the Answer to" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "X", + "answer": "A", + "correct": false, + "raw_response": "None of the options provided correspond to the Answer to" + } + ] + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "semantic-versioning/application-anchor", 
+ "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) 3.0.0 - because" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "semantic-versioning/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "socratic-method/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "socratic-method/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", 
+ "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "socratic-method/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" 
+ }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "sota/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "spc/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "spc/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": 
true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "spc/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "None of the options provided accurately describe the STRIDE" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": 
[ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "stride/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "swot/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "swot/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + 
}, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Analyze internal strengths and weaknesses of your current" + } + ] + }, + { + "label": "swot/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Analyze internal strengths and weaknesses of your current" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "tdd-chicago-school/recognition", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "D", + "correct": false, + "raw_response": "D)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + 
"correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "tdd-chicago-school/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Begin with tests for the core pricing calculations" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "tdd-london-school/recognition", + "score": 0.75, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "D", + "correct": false, + "raw_response": "D" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Write a test that mocks PaymentGateway and" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + 
"expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "tdd-london-school/consistency-variant-1", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Steve Freeman" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Steve Freeman" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Steve Freeman" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Steve Freeman" + } + ] + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Steve Freeman" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A) Steve Freeman" + }, + { + "permutation": [ + "C", 
+ "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Steve Freeman" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Steve Freeman" + } + ] + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 0.5, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "C", + "correct": false, + "raw_response": "C) Dan North" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "B", + "correct": false, + "raw_response": "B)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Steve Freeman" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C) Steve Freeman" + } + ] + }, + { + "label": "tdd-london-school/consistency-language", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D) Steve Freeman" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "testing-pyramid/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Three layers; more unit tests, fewer" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + 
"raw_response": "A) Three layers; more unit tests, fewer" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "timtowtdi/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": 
[ + "C", + "D", + "A", + "B" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + } + ] + }, + { + "label": "timtowtdi/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + 
"correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "user-story-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + 
"permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A)" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "wardley-mapping/recognition", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + 
"answer": "C", + "correct": true, + "raw_response": "C)" + } + ] + }, + { + "label": "wardley-mapping/application-anchor", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B)" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 1.0, + "results": [ + { + "permutation": [ + "A", + "B", + "C", + "D" + ], + "expected": "B", + "answer": "B", + "correct": true, + "raw_response": "B) Map the payment processing component's position on" + }, + { + "permutation": [ + "B", + "C", + "D", + "A" + ], + "expected": "A", + "answer": "A", + "correct": true, + "raw_response": "A" + }, + { + "permutation": [ + "C", + "D", + "A", + "B" + ], + "expected": "D", + "answer": "D", + "correct": true, + "raw_response": "D)" + }, + { + "permutation": [ + "D", + "A", + "B", + "C" + ], + "expected": "C", + "answer": "C", + "correct": true, + "raw_response": "C" + } + ] + } + ] + }, + "duration_seconds": 938.4 +} \ No newline at end of file diff --git a/evaluations/specs/adr-according-to-nygard.yaml b/evaluations/specs/adr-according-to-nygard.yaml new file mode 100644 index 0000000..95a2e80 --- /dev/null +++ b/evaluations/specs/adr-according-to-nygard.yaml @@ -0,0 +1,39 @@ +anchor: adr-according-to-nygard +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "ADR according to Nygard"? 
+ + ' + options: + A: Comprehensive architecture documentation with twelve standardized sections + covering context, building blocks, runtime, and deployment views + B: Lightweight records that capture a single architecture decision with its + context, decision, status, and consequences in a short, focused format + C: Collaborative review process where stakeholders evaluate architecture tradeoffs + through scenario-based analysis and quality attribute workshops + D: Visual modeling technique that represents software architecture at four levels + of abstraction from system context down to code + correct: B + application: + scenario: Your team is building a microservices platform and needs to choose between + REST APIs and GraphQL for service communication. The decision involves trade-offs + between performance, complexity, team expertise, and future scalability requirements. + anchor_prompt: using ADR according to Nygard + paraphrase_prompt: How should you document this architectural choice to ensure + future team members understand the reasoning and can make informed decisions + about related architecture changes? + options: + A: Create a comprehensive design document with detailed technical specifications, + implementation guidelines, and performance benchmarks that can be updated + as requirements change. + B: Write a concise record with the decision title, current status, context explaining + the forces at play, the chosen solution, and both positive and negative consequences, + then store it immutably with the codebase. + C: Document the decision in the project wiki with a detailed comparison matrix, + stakeholder approval signatures, and a change log for future modifications + to the decision. + D: Prepare a formal architecture review presentation covering the evaluation + criteria, alternative solutions considered, and implementation roadmap, then + archive it in the project repository. 
+ correct: B diff --git a/evaluations/specs/arc42.yaml b/evaluations/specs/arc42.yaml new file mode 100644 index 0000000..7532be9 --- /dev/null +++ b/evaluations/specs/arc42.yaml @@ -0,0 +1,40 @@ +anchor: arc42 +tier: 3 + +questions: + recognition: + question: | + Which of the following best describes "arc42"? + options: + A: A 12-section template for standardized software architecture documentation + B: A framework for automated architecture compliance checking in CI pipelines + C: A UML-based notation for modeling software components and connectors + D: A methodology for evaluating architecture trade-offs against quality attributes + correct: A + + application: + scenario: | + You are starting architecture documentation for a new microservices project. + The team needs to understand the system's external interfaces and deployment. + Where do you document these two concerns? + anchor_prompt: "following arc42" + paraphrase_prompt: "Create structured architecture documentation covering all relevant aspects" + options: + A: Both in a single Architecture Overview chapter + B: External interfaces in Section 3 (Context and Scope), deployment in Section 7 (Deployment View) + C: External interfaces in the API specification, deployment in the runbook + D: Both in Section 5 (Building Block View) at different abstraction levels + correct: B + + consistency: + variants: + - 'Which of the following persons is most closely associated with "arc42"?' + - 'Who created the arc42 architecture documentation template?' + - 'Which author is known for the arc42 documentation framework?' + language_variant: 'Welche Person wird am engsten mit "arc42" assoziiert?' + options: + A: Martin Fowler + B: Gernot Starke + C: Robert C. 
Martin + D: Simon Brown + correct: B diff --git a/evaluations/specs/atam.yaml b/evaluations/specs/atam.yaml new file mode 100644 index 0000000..a04100b --- /dev/null +++ b/evaluations/specs/atam.yaml @@ -0,0 +1,38 @@ +anchor: atam +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "ATAM"? + + ' + options: + A: Systematic documentation framework for software architecture decisions using + standardized templates that capture context, decision drivers, and consequences + B: Iterative risk-driven methodology for evaluating software architectures through + stakeholder workshops that identify risks, sensitivity points, and tradeoffs + using scenario-based analysis + C: Comprehensive template-based approach for documenting software architecture + across twelve standardized sections from context to deployment + D: Structured code review process with defined roles and phases that inspects + software artifacts against checklists and specifications + correct: B + application: + scenario: Your team is designing a financial trading platform where stakeholders + demand sub-100ms response times for trade execution, 99.99% uptime, and bank-level + security compliance. The architecture team has proposed using microservices + with event sourcing, but concerns have been raised about whether this approach + can simultaneously meet all quality requirements. + anchor_prompt: using ATAM + paraphrase_prompt: What systematic approach would best help evaluate whether the + proposed architecture can achieve the conflicting quality requirements? + options: + A: Conduct load testing on a prototype implementation to measure actual performance + metrics and identify bottlenecks before making architectural decisions. + B: Build a utility tree to prioritize quality scenarios, then analyze how microservices + and event sourcing create tradeoff points between performance, availability, + and security. 
+ C: Create detailed architecture documentation with UML diagrams and have senior + architects review the design against established enterprise patterns. + D: Implement proof-of-concept services for critical components and run security + penetration tests to validate compliance requirements early. + correct: B diff --git a/evaluations/specs/bdd-given-when-then.yaml b/evaluations/specs/bdd-given-when-then.yaml new file mode 100644 index 0000000..4dc2139 --- /dev/null +++ b/evaluations/specs/bdd-given-when-then.yaml @@ -0,0 +1,44 @@ +anchor: bdd-given-when-then +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "BDD (Behavior-Driven Development)"? + + ' + options: + A: Test-first development approach where unit tests are written before implementation + code, focusing on isolated component testing with mock objects to verify interactions + between system boundaries + B: Structured scenario format — Given a precondition, When an action occurs, + Then an expected outcome results; concrete examples as executable specifications + that define system behavior + C: Agile requirement gathering technique that maps user activities and tasks + in chronological order to identify system features and prioritize development + based on user journey workflows + D: Iterative testing methodology that emphasizes state-based verification through + direct assertions on system outputs, promoting comprehensive test coverage + without external dependencies or test doubles + correct: B + application: + scenario: Your team is building an e-commerce checkout system where business stakeholders + are concerned about complex discount rules and payment validation logic. The + product owner, developers, and QA engineers have different interpretations of + how promotional codes should work with various payment methods. 
+ anchor_prompt: using BDD (Behavior-Driven Development) + paraphrase_prompt: to ensure all stakeholders share the same understanding of + system behavior and create executable documentation + options: + A: Write detailed technical specifications first, then have developers implement + unit tests that verify the discount calculation algorithms work correctly + for each payment method combination. + B: Organize discovery workshops with the three amigos to write Given-When-Then + scenarios like 'Given a customer has a 20% discount code, When they checkout + with a credit card, Then the discount applies before payment processing' that + become executable tests. + C: Create comprehensive user stories with acceptance criteria, then have QA + engineers write end-to-end test scripts that validate the complete checkout + workflow from the user interface perspective. + D: Develop a prototype of the checkout system quickly, then gather feedback + from stakeholders through usability testing sessions to refine the discount + and payment features iteratively. + correct: B diff --git a/evaluations/specs/bem-methodology.yaml b/evaluations/specs/bem-methodology.yaml new file mode 100644 index 0000000..e6a7dfd --- /dev/null +++ b/evaluations/specs/bem-methodology.yaml @@ -0,0 +1,42 @@ +anchor: bem-methodology +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "BEM Methodology"? 
+ + ' + options: + A: A software architecture pattern that separates business logic, event handling, + and model validation to create scalable enterprise applications with clear + separation of concerns + B: Solve CSS specificity wars, naming conflicts, and stylesheet maintainability + issues in large codebases; standalone component that is meaningful on its + own (e.g., `menu`, `button`, `header`) + C: A project management methodology that emphasizes iterative development cycles, + continuous integration, and stakeholder feedback to deliver software products + efficiently + D: A database design approach that structures entities, relationships, and metadata + to optimize query performance and maintain data integrity across distributed + systems + correct: B + application: + scenario: You're developing a navigation component for an e-commerce website that + includes a logo, menu items, search functionality, and a shopping cart icon. + The navigation needs to support different states like active menu items, disabled + search when no products are available, and a compact version for mobile devices. 
+ anchor_prompt: using BEM Methodology + paraphrase_prompt: structure the CSS class names to ensure maintainability, avoid + naming conflicts, and clearly express component relationships + options: + A: Use semantic class names like `.navigation`, `.logo`, `.menu-link`, `.search-box`, + `.cart`, `.active-link`, `.disabled-search`, `.mobile-nav` + B: Use structured class names like `.navbar`, `.navbar__logo`, `.navbar__menu-item`, + `.navbar__search`, `.navbar__cart`, `.navbar__menu-item--active`, `.navbar__search--disabled`, + `.navbar--compact` + C: Use hierarchical class names like `.nav`, `.nav .logo`, `.nav .menu .item`, + `.nav .search.box`, `.nav .cart.icon`, `.nav .menu .item.active`, `.nav .search.disabled`, + `.nav.mobile` + D: Use descriptive class names like `.main-navigation`, `.site-logo`, `.primary-menu-link`, + `.product-search-input`, `.shopping-cart-button`, `.current-page-link`, `.inactive-search-field`, + `.mobile-navigation-bar` + correct: B diff --git a/evaluations/specs/bluf.yaml b/evaluations/specs/bluf.yaml new file mode 100644 index 0000000..4e20a6a --- /dev/null +++ b/evaluations/specs/bluf.yaml @@ -0,0 +1,38 @@ +anchor: bluf +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "BLUF (Bottom Line Up Front)"? + + ' + options: + A: Begin with background context and build logically toward the final conclusion; + comprehensive analysis precedes recommendations + B: State the main point, decision, or recommendation immediately; most important + information first, supporting details follow + C: Present multiple solutions in order of complexity; start with simple approaches + before advancing to detailed implementations + D: Structure information using inverted pyramid with broad overview first, then + progressively narrow to specific technical details + correct: B + application: + scenario: Your team has discovered a critical security vulnerability in the production + API that could expose user data. 
You need to send an urgent email to the CTO + and engineering leadership about the issue, its impact, and the proposed fix. + anchor_prompt: using BLUF (Bottom Line Up Front) + paraphrase_prompt: to communicate the most critical information first for time-pressed + executives who need to make immediate decisions + options: + A: Start with background context about recent security audits, then explain + how the vulnerability was discovered, detail the technical analysis process, + and conclude with the severity assessment and recommended actions. + B: 'Lead with: ''Critical API vulnerability requires immediate hotfix deployment + by EOD to prevent potential user data exposure.'' Follow with impact details, + technical specifics, and implementation timeline.' + C: Begin by outlining the discovery timeline, present a detailed technical analysis + of the vulnerability, discuss various potential solutions, then provide your + final recommendation and next steps. + D: Open with team credentials and recent security improvements, explain the + systematic approach used to identify issues, walk through the vulnerability + details, and end with proposed solutions. + correct: B diff --git a/evaluations/specs/c4-diagrams.yaml b/evaluations/specs/c4-diagrams.yaml new file mode 100644 index 0000000..9113143 --- /dev/null +++ b/evaluations/specs/c4-diagrams.yaml @@ -0,0 +1,39 @@ +anchor: c4-diagrams +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "C4-Diagrams"? 
+ + ' + options: + A: 'Four components of system design: data, process, interface, and security + layers' + B: 'Four levels of abstraction; Context: system in its environment (users, external + systems)' + C: 'Four phases of software development: requirements, design, implementation, + and testing' + D: 'Four categories of architectural patterns: layered, client-server, pipe-filter, + and event-driven' + correct: B + application: + scenario: Your team is building a new e-commerce platform and needs to present + the architecture to various stakeholders including executives, developers, and + operations staff. The system involves web applications, mobile apps, payment + services, inventory databases, and third-party shipping APIs. + anchor_prompt: using C4-Diagrams + paraphrase_prompt: create a comprehensive architectural documentation strategy + that effectively communicates system structure to all stakeholder groups + options: + A: Create detailed UML class diagrams showing all system interfaces, then add + deployment diagrams and sequence diagrams for each major user workflow to + ensure complete technical coverage. + B: Start with a context diagram showing the system and external actors, then + create container diagrams for applications and databases, followed by component + diagrams for complex containers as needed. + C: Begin with a comprehensive system landscape diagram, then create detailed + data flow diagrams, followed by network topology diagrams and API specification + documents for each service. + D: Design entity-relationship diagrams for all databases first, then create + service architecture diagrams, and finish with user journey maps and technical + infrastructure blueprints.
+ correct: B diff --git a/evaluations/specs/chain-of-thought.yaml b/evaluations/specs/chain-of-thought.yaml new file mode 100644 index 0000000..bdc5550 --- /dev/null +++ b/evaluations/specs/chain-of-thought.yaml @@ -0,0 +1,36 @@ +anchor: chain-of-thought +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Chain of Thought (CoT)"? + + ' + options: + A: Connect multiple AI models in sequence where each model's output becomes + the input for the next model in the processing pipeline + B: Explicitly show intermediate reasoning steps before reaching a conclusion; + make the thought process visible, not just the final answer + C: Structure prompts using a series of related examples that progressively guide + the model toward the desired response pattern + D: Break down complex problems into smaller, independent sub-problems that can + be solved separately and then combined for the final solution + correct: B + application: + scenario: Your team is debugging a complex data processing pipeline that produces + incorrect results for certain edge cases. The LLM-based component seems to jump + directly to conclusions without showing its reasoning process. You need to modify + the prompting strategy to make the model's decision-making process visible so + you can identify where the logic breaks down. + anchor_prompt: using Chain of Thought (CoT) + paraphrase_prompt: Which prompting approach would best help you understand and + debug the model's reasoning process for complex multi-step problems? + options: + A: Add more examples to the prompt with only the final correct answers, then + use temperature=0 for consistent outputs across all test cases. + B: Modify prompts to include phrases like 'Let's think step by step' and provide + examples that show intermediate reasoning steps before reaching conclusions. 
+ C: Increase the context window size and provide comprehensive background documentation + about all possible edge cases and their solutions. + D: Use ensemble methods by running multiple model instances with different random + seeds and selecting the most frequently occurring answer. + correct: B diff --git a/evaluations/specs/clean-architecture.yaml b/evaluations/specs/clean-architecture.yaml new file mode 100644 index 0000000..42eaf0b --- /dev/null +++ b/evaluations/specs/clean-architecture.yaml @@ -0,0 +1,39 @@ +anchor: clean-architecture +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Clean Architecture"? + + ' + options: + A: Dependencies flow bidirectionally between layers; presentation ↔ business + logic ↔ data access ↔ external services + B: Dependencies only point inward; entities → use cases → interface adapters + → frameworks & drivers + C: Code is organized by technical concerns; controllers → services → repositories + → database models + D: System components are loosely coupled through message passing; publishers + → message brokers → subscribers → event handlers + correct: B + application: + scenario: Your team is developing an e-commerce platform that needs to support + multiple payment processors (Stripe, PayPal, Square) and may need to switch + between them based on business requirements. The payment processing logic contains + complex fraud detection rules and transaction validation that must remain consistent + regardless of which payment provider is used. 
+ anchor_prompt: using Clean Architecture + paraphrase_prompt: to ensure the core business logic remains independent of external + payment providers while maintaining flexibility to switch between them + options: + A: Create a shared payment utility class that contains all provider-specific + code and business rules, then inject different configuration objects to switch + between providers + B: Define payment processing use cases in the core layer with abstract interfaces, + implement provider-specific adapters in the outer layer, and inject dependencies + inward through dependency inversion + C: Build separate microservices for each payment provider with a central orchestrator + service that routes requests and handles all business logic validation + D: Implement a factory pattern that returns different payment processor instances, + with each processor containing its own copy of the fraud detection and validation + logic + correct: B diff --git a/evaluations/specs/control-chart-shewhart.yaml b/evaluations/specs/control-chart-shewhart.yaml new file mode 100644 index 0000000..2774073 --- /dev/null +++ b/evaluations/specs/control-chart-shewhart.yaml @@ -0,0 +1,32 @@ +anchor: control-chart-shewhart +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Control Chart (Shewhart)"? + + ' + options: + A: Graphical representation of system architecture components and their dependencies + B: Measured value plotted over time; centerline at the process mean + C: Visual workflow diagram showing sequential steps in a development process + D: Matrix displaying test coverage metrics across different software modules + correct: B + application: + scenario: Your team is monitoring API response times for a critical microservice + that handles user authentication. Over the past month, you've collected response + time measurements every hour during business hours. The service occasionally + experiences unexplained spikes in response time that affect user experience.
+ anchor_prompt: using Control Chart (Shewhart) + paraphrase_prompt: to systematically distinguish between normal performance fluctuations + and genuine performance issues that require investigation + options: + A: Set fixed thresholds at 95th and 99th percentiles of historical data, then + alert whenever current measurements exceed these static boundaries + B: Plot response times over time with a centerline at the process mean and control + limits at ±3 standard deviations, then investigate points outside these limits + or patterns within the limits + C: Use machine learning anomaly detection to automatically identify outliers + based on complex multivariate patterns in the time series data + D: Calculate rolling averages over 24-hour windows and trigger alerts when the + current average deviates more than 20% from the previous day's average + correct: B diff --git a/evaluations/specs/conventional-commits.yaml b/evaluations/specs/conventional-commits.yaml new file mode 100644 index 0000000..a667196 --- /dev/null +++ b/evaluations/specs/conventional-commits.yaml @@ -0,0 +1,44 @@ +anchor: conventional-commits +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Conventional Commits"? + + ' + options: + A: A branching strategy where feature branches follow naming conventions like + feature/JIRA-123 with mandatory code review before merging to main + B: '<type>[(optional scope)][!]: <description> + optional body/footer; common + types include feat and fix' + C: A software architecture pattern that enforces strict separation between business + logic and infrastructure through standardized interface contracts + D: A deployment methodology that requires all releases to pass through predefined + stages with automated gates and rollback capabilities + correct: B + application: + scenario: Your team is preparing to release version 2.1.3 of your API library.
+ During code review, you notice that one developer's pull request removes a deprecated + method that some users might still be calling, while another developer's PR + adds a new optional parameter to an existing function. The team lead wants all + commit messages to clearly indicate how these changes should affect the next + version number. + anchor_prompt: using Conventional Commits + paraphrase_prompt: structure the commit messages to clearly communicate the semantic + versioning impact of these changes + options: + A: 'feat: remove deprecated getUserData method and add timeout parameter to + fetchUser function' + B: 'feat!: remove deprecated getUserData method + + + BREAKING CHANGE: getUserData method no longer available + + + feat: add optional timeout parameter to fetchUser function' + C: 'refactor: remove getUserData method + + + enhancement: add timeout parameter to fetchUser function' + D: 'chore: clean up deprecated getUserData method and improve fetchUser function + with timeout support' + correct: B diff --git a/evaluations/specs/cqrs.yaml b/evaluations/specs/cqrs.yaml new file mode 100644 index 0000000..36bd963 --- /dev/null +++ b/evaluations/specs/cqrs.yaml @@ -0,0 +1,46 @@ +anchor: cqrs +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "CQRS (Command Query Responsibility + Segregation)"? 
+ + ' + options: + A: An architectural pattern that separates business logic from infrastructure + concerns by defining ports and adapters, where the core domain remains independent + of external systems and frameworks + B: Bertrand Meyer's principle — methods either change state (commands) or return + data (queries), never both; write operations that change state and return + void; represent intent as immutable command objects + C: A design approach where complex business domains are modeled through ubiquitous + language, bounded contexts, and aggregate roots to align software structure + with business requirements + D: A distributed system pattern that ensures data consistency across microservices + by coordinating transactions through a central orchestrator that manages compensating + actions for failures + correct: B + application: + scenario: Your e-commerce platform handles millions of product searches daily + but only thousands of inventory updates per hour. The current unified data model + causes performance bottlenecks as complex search queries with filters, sorting, + and recommendations compete for database resources with critical inventory management + operations. + anchor_prompt: using CQRS (Command Query Responsibility Segregation) + paraphrase_prompt: How would you architect the system to optimize both the high-volume + search operations and the critical inventory updates without them interfering + with each other? + options: + A: Implement database sharding to distribute both search and inventory operations + across multiple database instances, using product categories as the sharding + key to balance the load evenly. + B: 'Create separate optimized data models: a normalized write model for inventory + commands and denormalized read models for search queries, synchronized through + domain events with eventual consistency.' 
+ C: Use a master-slave database replication setup where all inventory updates + go to the master and search queries are distributed across multiple read replicas + to reduce contention. + D: Implement a caching layer with Redis to store frequently accessed product + data and search results, reducing database load while maintaining a single + unified data model. + correct: B diff --git a/evaluations/specs/cynefin-framework.yaml b/evaluations/specs/cynefin-framework.yaml new file mode 100644 index 0000000..6bccb1a --- /dev/null +++ b/evaluations/specs/cynefin-framework.yaml @@ -0,0 +1,39 @@ +anchor: cynefin-framework +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Cynefin Framework"? + + ' + options: + A: Strategic mapping technique that visualizes value chains and component evolution + stages over time + B: 'Five domains; Clear: best practices apply, sense-categorize-respond' + C: 'Agile methodology framework with four iterative phases: plan-do-check-act + for continuous improvement' + D: 'Decision-making model using three assessment layers: context-analysis-action + for complex problems' + correct: B + application: + scenario: 'Your team is facing three different challenges: a well-understood database + migration that follows established procedures, intermittent performance issues + that require expert analysis, and completely unpredictable user behavior patterns + in a new AI feature. The team is debating how to approach each problem and allocate + resources effectively.' + anchor_prompt: using Cynefin Framework + paraphrase_prompt: categorize these challenges by their complexity characteristics + to determine the most appropriate decision-making approach for each + options: + A: Treat all three as technical problems requiring expert analysis, form specialized + teams for each, and conduct thorough requirements gathering before taking + action on any of them.
+ B: Apply best practices to the database migration, assign experts to analyze + the performance issues, and run small experiments to understand the AI feature + behavior patterns. + C: Prioritize all three challenges by business impact, assign the most experienced + developers to each, and create detailed project plans with fixed timelines + for resolution. + D: Escalate all three issues to senior architects for decision-making, document + comprehensive risk assessments, and implement the solutions with the highest + confidence levels first. + correct: B diff --git a/evaluations/specs/definition-of-done.yaml b/evaluations/specs/definition-of-done.yaml new file mode 100644 index 0000000..bd1b96c --- /dev/null +++ b/evaluations/specs/definition-of-done.yaml @@ -0,0 +1,42 @@ +anchor: definition-of-done +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Definition of Done"? + + ' + options: + A: A comprehensive project timeline that outlines all deliverables, milestones, + and dependencies from project initiation through final deployment; includes + resource allocation, risk assessments, and stakeholder sign-off requirements + B: A formal, team-wide checklist of quality criteria that every increment must + satisfy before it is declared "done"; concrete, verifiable conditions — e.g., + code reviewed, tests passing, documentation updated, no known defects + C: A structured template for capturing user requirements in the format 'As a + [user type], I want [functionality] so that [business value]'; includes acceptance + criteria, story points, and priority rankings for backlog management + D: A prioritization framework that categorizes requirements into Must have, + Should have, Could have, and Won't have categories; helps teams focus on essential + features while managing scope and stakeholder expectations effectively + correct: B + application: + scenario: Your agile team has been experiencing issues with features being marked + as 
complete during sprints, only to discover missing documentation, failing + edge case tests, or incomplete code reviews during the final sprint review. + The Product Owner is frustrated because features appear done in daily standups + but aren't actually ready for release. + anchor_prompt: using Definition of Done + paraphrase_prompt: establish a shared understanding of what constitutes truly + completed work to prevent late-cycle surprises and ensure consistent quality + standards + options: + A: Create individual checklists for each team member based on their role and + expertise, allowing developers to focus on code while QA handles testing criteria + B: Collaboratively create a single, team-wide checklist of quality criteria + that every increment must satisfy before being declared complete, including + code review, tests passing, and documentation updated + C: Have the Product Owner define completion criteria for each user story individually + based on business value and customer requirements + D: Implement a post-sprint quality gate where a designated team lead reviews + all completed work and decides what meets release standards + correct: B diff --git a/evaluations/specs/devils-advocate.yaml b/evaluations/specs/devils-advocate.yaml new file mode 100644 index 0000000..15397f4 --- /dev/null +++ b/evaluations/specs/devils-advocate.yaml @@ -0,0 +1,35 @@ +anchor: devils-advocate +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Devil''s Advocate"? 
+ + ' + options: + A: Systematically identify potential failure points and weaknesses in a system + design before implementation begins + B: Present opposing viewpoints even if not personally held; question premises + and surface hidden assumptions + C: Challenge team members' technical decisions through aggressive questioning + to test their knowledge and expertise + D: Assign responsibility for identifying risks and negative outcomes to a designated + team member during planning sessions + correct: B + application: + scenario: Your team has designed a new microservices architecture that will replace + the current monolithic system. The design has been well-received by stakeholders + and addresses all known requirements. Before finalizing the architecture decision, + you want to ensure you haven't overlooked critical issues. + anchor_prompt: using Devil's Advocate + paraphrase_prompt: What approach should you take to identify potential weaknesses + in your architecture design before implementation? + options: + A: Conduct a final walkthrough with stakeholders to confirm the design meets + all their stated requirements and get formal sign-off + B: Systematically argue against your own design by presenting the strongest + possible case for why this architecture could fail or cause problems + C: Create detailed implementation timelines and resource estimates to validate + the feasibility of the proposed architecture + D: Research similar architectures used by other companies to benchmark your + design against industry best practices + correct: B diff --git a/evaluations/specs/diataxis-framework.yaml b/evaluations/specs/diataxis-framework.yaml new file mode 100644 index 0000000..e9fe327 --- /dev/null +++ b/evaluations/specs/diataxis-framework.yaml @@ -0,0 +1,37 @@ +anchor: diataxis-framework +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Diátaxis Framework"? 
+ + ' + options: + A: Agile methodology framework focusing on iterative development cycles with + continuous stakeholder feedback loops + B: 'Four documentation types; Tutorials: learning-oriented, lessons for beginners' + C: Software architecture pattern that separates presentation, business logic, + and data access into distinct layers + D: Project management approach emphasizing cross-functional teams and rapid + prototyping for complex systems + correct: B + application: + scenario: Your team has developed a new API authentication library and needs to + create comprehensive documentation. Users are complaining that they can't find + what they need - some want to learn the basics, others need quick solutions + to specific problems, and developers need detailed technical specifications. + anchor_prompt: using Diátaxis Framework + paraphrase_prompt: organize the documentation to systematically address different + user needs and purposes + options: + A: Create a single comprehensive guide that covers everything from basic concepts + to advanced implementation details, organized by feature complexity from simple + to advanced use cases. + B: 'Develop four distinct documentation sections: beginner tutorials for learning, + task-specific how-to guides, complete API reference materials, and conceptual + explanations of authentication principles.' + C: Structure documentation around user personas, creating separate sections + for frontend developers, backend developers, security engineers, and project + managers with role-specific information. + D: 'Organize content chronologically following the typical development workflow: + planning, setup, implementation, testing, deployment, and maintenance phases.'
+ correct: B diff --git a/evaluations/specs/docs-as-code.yaml b/evaluations/specs/docs-as-code.yaml new file mode 100644 index 0000000..1c7c807 --- /dev/null +++ b/evaluations/specs/docs-as-code.yaml @@ -0,0 +1,38 @@ +anchor: docs-as-code +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Docs-as-Code according to Ralf + D. Müller"? + + ' + options: + A: Writing documentation in a wiki with WYSIWYG editing and real-time collaboration + B: 'Treating documentation like source code: version-controlled, peer-reviewed, + and built automatically' + C: Generating API documentation automatically from code annotations and docstrings + D: Maintaining a separate documentation repository with its own release cycle + correct: B + application: + scenario: Your development team maintains a microservices platform with complex + API documentation that frequently becomes outdated when code changes. The team + uses Git for version control and has automated CI/CD pipelines. Management wants + documentation that stays synchronized with code changes and can be generated + in multiple formats for different stakeholders. + anchor_prompt: using Docs-as-Code according to Ralf D. Müller + paraphrase_prompt: How should you structure and manage the documentation workflow + to ensure it remains current and accessible? + options: + A: Create a centralized wiki system with automated API extraction, assign documentation + ownership to technical writers, and schedule weekly documentation reviews + to ensure accuracy across all services. + B: Write documentation in AsciiDoc format stored in Git repositories alongside + code, implement docToolchain with Gradle automation, use PlantUML for diagrams, + and require documentation updates in every pull request. + C: Implement a headless CMS with version control integration, create documentation + templates in Microsoft Word, and establish a quarterly documentation sprint + cycle with stakeholder review sessions. 
+ D: Set up Confluence spaces linked to JIRA tickets, use embedded Lucidchart + diagrams, create documentation branches that merge after code releases, and + maintain separate review cycles for docs and code. + correct: B diff --git a/evaluations/specs/domain-driven-design.yaml b/evaluations/specs/domain-driven-design.yaml new file mode 100644 index 0000000..e50d907 --- /dev/null +++ b/evaluations/specs/domain-driven-design.yaml @@ -0,0 +1,41 @@ +anchor: domain-driven-design +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Domain-Driven Design according + to Evans"? + + ' + options: + A: Architectural pattern that separates business logic into distinct layers + with clear interfaces between presentation, application, and data access components + B: Shared vocabulary between developers and domain experts; explicit boundaries + where a model is defined and applicable + C: Software development methodology that emphasizes iterative delivery of working + software through close collaboration between cross-functional teams and stakeholders + D: Design approach that focuses on creating reusable software components with + well-defined interfaces that can be composed into larger systems + correct: B + application: + scenario: Your team is building a complex insurance claims processing system where + business rules frequently change and involve multiple departments (underwriting, + claims adjustment, fraud detection, customer service). The business stakeholders + use terms like 'policy holder,' 'coverage limits,' and 'claim settlement' but + developers are implementing these concepts inconsistently across different parts + of the system. 
+ anchor_prompt: using Domain-Driven Design according to Evans + paraphrase_prompt: to ensure consistent understanding and implementation of business + concepts across the development team and stakeholders + options: + A: Create comprehensive technical documentation that maps business terms to + database schemas and API endpoints, then train all developers on the correct + technical implementations + B: Establish a ubiquitous language by working closely with domain experts to + define shared vocabulary, then ensure this language is consistently used in + code, conversations, and models + C: Implement a centralized data dictionary service that validates all business + term usage across microservices and enforces standardized naming conventions + D: Organize regular cross-functional meetings where business stakeholders explain + requirements to developers using standardized requirement templates and acceptance + criteria + correct: B diff --git a/evaluations/specs/ears-requirements.yaml b/evaluations/specs/ears-requirements.yaml new file mode 100644 index 0000000..a87c03c --- /dev/null +++ b/evaluations/specs/ears-requirements.yaml @@ -0,0 +1,40 @@ +anchor: ears-requirements +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "EARS-Requirements"? + + ' + options: + A: A systematic approach for evaluating and analyzing requirements through stakeholder + interviews and documentation review + B: '"The <system> shall <response>"; "When <trigger>, the <system> shall <response>"' + C: 'A framework for organizing requirements into hierarchical categories: Essential, + Auxiliary, Regulatory, and Supplementary' + D: A validation methodology that ensures requirements are Explicit, Achievable, + Relevant, and Specific before implementation + correct: B + application: + scenario: You are documenting requirements for a medical device monitoring system + that tracks patient vital signs.
The system must handle various operational + states, emergency conditions, and optional features like wireless connectivity. + Your team needs clear, testable requirements that will support regulatory approval + and system verification. + anchor_prompt: using EARS-Requirements + paraphrase_prompt: structure these requirements using a systematic template-based + approach that ensures clarity and testability + options: + A: Write detailed user stories with acceptance criteria, organize them by epic + and priority, and include definition of done for each story to ensure the + development team understands the business value. + B: 'Structure requirements using specific templates: ''The system shall...'' + for basic functions, ''WHEN alarm triggered the system shall...'' for events, + ''WHILE monitoring the system shall...'' for states, and ''IF battery low + THEN the system shall...'' for conditions.' + C: Create a comprehensive requirements specification document with functional + and non-functional sections, include use case diagrams, and establish a requirements + traceability matrix linking to test cases. + D: Define requirements as measurable objectives with key performance indicators, + establish SMART criteria for each requirement, and create a validation framework + with quantitative success metrics. + correct: B diff --git a/evaluations/specs/event-driven-architecture.yaml b/evaluations/specs/event-driven-architecture.yaml new file mode 100644 index 0000000..81ea3a9 --- /dev/null +++ b/evaluations/specs/event-driven-architecture.yaml @@ -0,0 +1,44 @@ +anchor: event-driven-architecture +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Event-Driven Architecture"? 
+ + ' + options: + A: Components are organized around business domains with clear boundaries, where + each domain contains its own models, services, and data stores that reflect + real-world business concepts + B: Components communicate by emitting and reacting to events rather than direct + calls; producers publish events without knowing which consumers will process + them + C: Components are structured in concentric layers with business logic at the + center, isolated from external concerns through dependency inversion and interface + abstractions + D: Components communicate through well-defined interfaces at the boundaries + while keeping core business logic independent of external frameworks, databases, + and user interfaces + correct: B + application: + scenario: Your e-commerce platform needs to handle order processing, inventory + updates, payment processing, and shipping notifications. Currently, the order + service directly calls the inventory service, payment service, and shipping + service synchronously, causing delays and tight coupling between services. + anchor_prompt: using Event-Driven Architecture + paraphrase_prompt: How would you redesign this system to reduce coupling between + services and improve scalability while ensuring all necessary business processes + still execute when orders are placed? + options: + A: Create a centralized order orchestrator service that manages the workflow + by making sequential API calls to inventory, payment, and shipping services + with retry logic and circuit breakers. + B: Have the order service publish an 'OrderPlaced' event to a message queue, + with inventory, payment, and shipping services subscribing to process their + respective tasks independently and asynchronously. + C: Implement a shared database that all services can read from and write to, + with database triggers that automatically update related tables when orders + are inserted. 
+ D: Use a microservices gateway that routes requests to the appropriate services + and aggregates responses, with caching layers to improve performance between + service calls. + correct: B diff --git a/evaluations/specs/fagan-inspection.yaml b/evaluations/specs/fagan-inspection.yaml new file mode 100644 index 0000000..9dc9b40 --- /dev/null +++ b/evaluations/specs/fagan-inspection.yaml @@ -0,0 +1,42 @@ +anchor: fagan-inspection +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Fagan Inspection"? + + ' + options: + A: A systematic code mutation technique where defects are artificially introduced + into software modules to evaluate the effectiveness of existing test suites + and identify gaps in test coverage. + B: A structured, multi-phase review process for software artifacts (requirements, + design, code) with defined roles and entry/exit criteria; moderator (facilitates + and logs), author (created the artifact), inspectors (reviewers), recorder + (documents defects) + C: A formal verification methodology that uses mathematical proofs and static + analysis to demonstrate software correctness without executing the program, + focusing on pre-conditions and post-conditions. + D: A risk-based assessment framework for evaluating software architecture decisions + through scenario-based analysis, stakeholder interviews, and systematic documentation + of trade-offs and quality attributes. + correct: B + application: + scenario: Your team is developing flight control software for a commercial aircraft. + The requirements document has been completed and needs to be reviewed before + the design phase begins. Several team members have expressed concerns about + potential ambiguities and missing edge cases in the requirements. + anchor_prompt: using Fagan Inspection + paraphrase_prompt: What structured approach should you take to systematically + review the requirements document with your team? 
+ options: + A: Schedule a team meeting where everyone reads through the requirements together + and discusses any issues they notice during the session. + B: Assign specific roles including a moderator and recorder, have each inspector + study the requirements individually beforehand, then hold a formal meeting + to identify and classify defects systematically. + C: Distribute the requirements to all team members via email and ask them to + send back their comments within a week, then compile all feedback into a single + document. + D: Have the requirements author present the document to the team in a walkthrough + session where attendees can ask questions and suggest improvements in real-time. + correct: B diff --git a/evaluations/specs/feynman-technique.yaml b/evaluations/specs/feynman-technique.yaml new file mode 100644 index 0000000..7e2eb30 --- /dev/null +++ b/evaluations/specs/feynman-technique.yaml @@ -0,0 +1,38 @@ +anchor: feynman-technique +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Feynman Technique"? + + ' + options: + A: Break down complex problems into smaller, manageable components by creating + visual diagrams that map dependencies and identify potential bottlenecks in + the system + B: Teach the concept in simple language as if to a beginner (traditionally "explain + to a 12-year-old"); when you struggle to explain, you've found gaps in your + understanding + C: Use rapid prototyping and iterative feedback loops to validate assumptions + early in the development process before committing to full implementation + D: Apply the principle of progressive disclosure by revealing information gradually + to users, starting with the most essential features and adding complexity + as needed + correct: B + application: + scenario: Sarah is a senior developer who needs to understand a complex distributed + caching system before implementing a critical feature. 
She's read the documentation + and architecture diagrams, but feels uncertain about key concepts like cache + coherency protocols and distributed consensus mechanisms. + anchor_prompt: using Feynman Technique + paraphrase_prompt: to identify and fill knowledge gaps about the caching system + options: + A: Create detailed technical diagrams mapping all system components and their + interactions, then review them with the architecture team to ensure accuracy. + B: Write a simple explanation of how the caching system works as if teaching + it to a junior developer, noting where she struggles to explain clearly, then + study those areas more deeply. + C: Build a small prototype implementation to test her assumptions about the + system behavior and identify any gaps through hands-on experimentation. + D: Schedule meetings with the original system architects to ask detailed questions + about implementation decisions and document their responses. + correct: B diff --git a/evaluations/specs/five-whys.yaml b/evaluations/specs/five-whys.yaml new file mode 100644 index 0000000..a246900 --- /dev/null +++ b/evaluations/specs/five-whys.yaml @@ -0,0 +1,40 @@ +anchor: five-whys +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Five Whys (Ohno)"? + + ' + options: + A: Create five alternative solution paths and systematically evaluate each option + against predefined criteria to select the optimal approach + B: Ask "Why?" 
repeatedly (typically ~5 times) to drill down to root causes; + distinguish between surface symptoms and underlying causes + C: Break down complex problems into exactly five manageable components and address + each component using dedicated team resources + D: Conduct five rounds of stakeholder interviews to gather comprehensive requirements + and validate assumptions before implementation + correct: B + application: + scenario: Your team's automated deployment pipeline has failed three times this + week, each time requiring manual intervention to complete the release. The immediate + cause appears to be intermittent network timeouts during the artifact upload + phase, but previous attempts to increase timeout values haven't resolved the + underlying issue. + anchor_prompt: using Five Whys (Ohno) + paraphrase_prompt: to systematically drill down from surface symptoms to identify + the actionable root cause of these recurring deployment failures + options: + A: Document all three failure instances, categorize the types of network errors, + and create a comprehensive troubleshooting runbook for future occurrences + of similar timeout issues. + B: Ask why network timeouts occur, then why that cause exists, continuing this + questioning process until you reach an underlying cause that the team can + take concrete action to prevent. + C: Gather the development, infrastructure, and network teams to brainstorm all + possible factors contributing to deployment failures and create a fishbone + diagram mapping relationships between causes. + D: Implement monitoring dashboards to track network latency patterns, set up + automated alerts for timeout thresholds, and establish escalation procedures + for deployment pipeline failures. 
+ correct: B diff --git a/evaluations/specs/fowler-patterns.yaml b/evaluations/specs/fowler-patterns.yaml new file mode 100644 index 0000000..50e4b29 --- /dev/null +++ b/evaluations/specs/fowler-patterns.yaml @@ -0,0 +1,43 @@ +anchor: fowler-patterns +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Patterns of Enterprise Application + Architecture (PEAA)"? + + ' + options: + A: A comprehensive framework defining bounded contexts, aggregates, entities, + value objects, repositories, and domain services for implementing complex + business logic in enterprise systems + B: Transaction Script, Domain Model, Table Module, Service Layer; table data + gateway, row data gateway, active record, data mapper + C: An architectural approach emphasizing ports and adapters, dependency inversion, + use cases, interactors, and clean separation between business rules and external + frameworks or databases + D: A layered architecture pattern consisting of presentation layer, business + logic layer, data access layer, and cross-cutting concerns like logging, security, + and transaction management + correct: B + application: + scenario: Your team is building an e-commerce platform where customer orders involve + complex business rules like discount calculations, inventory checks, and shipping + validations. The application needs to handle high transaction volumes while + maintaining data consistency across multiple database tables. + anchor_prompt: using Patterns of Enterprise Application Architecture (PEAA) + paraphrase_prompt: What architectural approach would best organize the business + logic and data access for this complex transactional system? + options: + A: Implement a microservices architecture with each service handling a single + business capability, using REST APIs for communication and eventual consistency + for data synchronization. 
+ B: Use a Domain Model pattern for complex business logic with a Data Mapper + pattern for persistence, complemented by a Unit of Work pattern to manage + transactions across multiple entities. + C: Create a single monolithic service with stored procedures handling all business + logic in the database layer, using direct SQL calls from the presentation + tier. + D: Build a reactive event-driven system using CQRS with separate read and write + models, implementing event sourcing to capture all state changes as immutable + events. + correct: B diff --git a/evaluations/specs/gherkin.yaml b/evaluations/specs/gherkin.yaml new file mode 100644 index 0000000..171f997 --- /dev/null +++ b/evaluations/specs/gherkin.yaml @@ -0,0 +1,40 @@ +anchor: gherkin +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Gherkin"? + + ' + options: + A: A domain-specific language for writing human-readable executable specifications + using Given/When/Then keywords in behavior-driven development + B: A software testing methodology that emphasizes writing failing tests first, + then implementing the minimum code to make them pass + C: A requirements elicitation technique using targeted questions to uncover + assumptions and clarify stakeholder needs + D: A structured format for documenting acceptance criteria using natural language + templates with numbered steps and expected outcomes + correct: A + application: + scenario: Your team is developing an e-commerce checkout system where business + analysts need to specify payment validation rules that developers can implement + and testers can verify. The product owner wants to ensure that both successful + payments and various error conditions are properly handled, and all stakeholders + need to understand the expected behavior. 
+ anchor_prompt: using Gherkin + paraphrase_prompt: structure the payment validation specifications so they can + serve as both human-readable documentation and automated test cases + options: + A: Write detailed technical specifications in confluence with UML diagrams showing + payment flow states, then create separate unit tests with mock payment gateways + to verify each validation rule independently. + B: Create feature files with scenarios that use Given-When-Then steps to describe + payment contexts, user actions, and expected outcomes, organizing related + scenarios under payment validation features with shared background steps. + C: Document payment rules in user story format with acceptance criteria bullets, + then implement integration tests that call actual payment APIs to validate + the complete payment processing workflow. + D: Build a requirements traceability matrix linking business rules to test cases, + then write automated UI tests that simulate user interactions with the checkout + form to verify payment validation behavior. + correct: B diff --git a/evaluations/specs/github-flow.yaml b/evaluations/specs/github-flow.yaml new file mode 100644 index 0000000..9560f78 --- /dev/null +++ b/evaluations/specs/github-flow.yaml @@ -0,0 +1,37 @@ +anchor: github-flow +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "GitHub Flow"? + + ' + options: + A: A semantic versioning system that automatically increments version numbers + based on commit message patterns and release types. + B: A lightweight, branch-based workflow in which you create a branch from main, commit changes, open a pull request for review and discussion, then merge into main and deploy. + C: A standardized format for writing commit messages that includes type, scope, + and description to improve project history readability. + D: A distributed version control architecture that enables multiple developers + to work on separate repositories while maintaining code synchronization.
+ correct: B + application: + scenario: Your team is working on a web application that gets deployed to production + multiple times per day. A critical bug has been reported by users, and you need + to implement a hotfix while ensuring the main branch remains stable and deployable. + The team follows a branch-based workflow where every change goes through code + review. + anchor_prompt: using GitHub Flow + paraphrase_prompt: What approach should you take to implement and deploy this + hotfix while maintaining continuous delivery practices? + options: + A: Create a hotfix branch from the latest release tag, implement the fix, merge + it back to both the release branch and main, then deploy from the release + branch + B: Create a hotfix branch from main, implement the fix, open a pull request + for code review, merge to main after approval, then deploy immediately from + main + C: Implement the fix directly on main branch, commit the changes, run tests + locally, then push and deploy if tests pass + D: Create a hotfix branch from main, implement the fix, merge it to a staging + branch for testing, then merge from staging to main after validation + correct: B diff --git a/evaluations/specs/gutes-deutsch-wolf-schneider.yaml b/evaluations/specs/gutes-deutsch-wolf-schneider.yaml new file mode 100644 index 0000000..c6e4387 --- /dev/null +++ b/evaluations/specs/gutes-deutsch-wolf-schneider.yaml @@ -0,0 +1,44 @@ +anchor: gutes-deutsch-wolf-schneider +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Gutes Deutsch nach Wolf Schneider"? + + ' + options: + A: Structure information hierarchically with the most important conclusion first, + followed by supporting arguments grouped logically — each section should build + upon the previous one; use consistent formatting throughout. 
+ B: Prefer short, direct sentences over long, complex ones — every sentence should + express one idea; use active constructions; avoid passive voice and impersonal + constructions wherever possible + C: Begin with the bottom line up front, presenting key findings immediately + — organize supporting details in order of decreasing importance; eliminate + unnecessary background information and focus on actionable insights. + D: Apply modular design principles where each component serves a single responsibility + — minimize dependencies between modules; favor composition over inheritance + and maintain loose coupling throughout the system architecture. + correct: B + application: + scenario: A German software company is revising their user documentation after + receiving complaints that it's difficult to understand. The current version + contains many long sentences with multiple clauses, passive constructions, and + abstract technical jargon. The technical writing team needs to rewrite a key + section explaining how users can configure system settings. + anchor_prompt: using Gutes Deutsch nach Wolf Schneider + paraphrase_prompt: to create clear, accessible German prose that prioritizes reader + comprehension and eliminates unnecessary complexity + options: + A: Use sophisticated vocabulary and complex sentence structures to demonstrate + technical expertise, incorporate industry-standard terminology throughout, + and maintain formal passive voice constructions to convey professional authority. + B: Write short, direct sentences with active voice, replace abstract noun phrases + with concrete verbs, eliminate filler words, and choose familiar terms over + technical jargon when both convey the same meaning. + C: Focus on comprehensive coverage by including detailed explanations for every + possible scenario, use subordinate clauses to show relationships between concepts, + and employ precise technical terminology for accuracy. 
+ D: Structure content using bullet points and numbered lists exclusively, maintain + consistent sentence length throughout the document, and include extensive + cross-references to related technical specifications and standards. + correct: B diff --git a/evaluations/specs/hexagonal-architecture.yaml b/evaluations/specs/hexagonal-architecture.yaml new file mode 100644 index 0000000..085e3fb --- /dev/null +++ b/evaluations/specs/hexagonal-architecture.yaml @@ -0,0 +1,41 @@ +anchor: hexagonal-architecture +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Hexagonal Architecture (Ports + & Adapters)"? + + ' + options: + A: Six-layered application structure where each layer handles specific responsibilities + and communicates only with adjacent layers + B: Core domain at the center, isolated from external concerns; interfaces defining + how the application communicates + C: Database design pattern using six normalized tables with adapter classes + to handle object-relational mapping between entities + D: Microservices architecture pattern where six independent services communicate + through standardized API ports and message adapters + correct: B + application: + scenario: Your team is building a payment processing service that needs to support + multiple payment gateways (Stripe, PayPal, Square), handle requests from both + a web API and a mobile SDK, store transaction data in PostgreSQL, and send notifications + via email and SMS. The business requirements are complex but well-defined, while + the specific technologies may change over time. + anchor_prompt: using Hexagonal Architecture (Ports & Adapters) + paraphrase_prompt: How would you structure this system to maximize testability, + technology independence, and the ability to easily swap external integrations? 
+ options: + A: Create a layered architecture with separate presentation, business logic, + and data access layers, using dependency injection to manage connections between + payment gateways and notification services. + B: Place payment processing domain logic at the center, define port interfaces + for payment gateways and notifications, then implement adapters for each external + service, ensuring all dependencies point inward to the core domain. + C: Build a microservices architecture with separate services for each payment + gateway, a central orchestrator service, and shared databases to maintain + consistency across all payment operations. + D: Implement a plugin-based architecture where each payment gateway and notification + method is a plugin, with a central registry managing plugin lifecycle and + a shared event bus for communication. + correct: B diff --git a/evaluations/specs/iec-61508-sil-levels.yaml b/evaluations/specs/iec-61508-sil-levels.yaml new file mode 100644 index 0000000..72bd587 --- /dev/null +++ b/evaluations/specs/iec-61508-sil-levels.yaml @@ -0,0 +1,35 @@ +anchor: iec-61508-sil-levels +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "IEC 61508 SIL Levels"? + + ' + options: + A: 'Three Quality Assurance Levels; Level 1: 10^-3^ ≤ pfd < 10^-2^ (acceptable defect + density)' + B: 'Four Safety Integrity Levels; SIL 1: 10^-2^ ≤ pfd < 10^-1^ (tolerable risk reduction)' + C: 'Five Reliability Assessment Levels; Level 1: 10^-4^ ≤ pfd < 10^-3^ (minimum performance + threshold)' + D: 'Six Verification Testing Levels; Level 1: 10^-1^ ≤ pfd < 10^0^ (standard compliance + range)' + correct: B + application: + scenario: Your team is developing a safety instrumented system for a chemical + processing plant that must prevent overpressure conditions.
The hazard analysis + indicates that failure of this safety function could result in equipment damage + and potential worker injury, with a tolerable risk requiring the safety system + to have a probability of failure on demand between 10^-3 and 10^-2. + anchor_prompt: using IEC 61508 SIL Levels + paraphrase_prompt: determine the appropriate safety integrity classification and + corresponding development requirements for this safety function + options: + A: Classify as SIL 1, implement basic software development practices with minimal + verification requirements and simple hardware architecture constraints + B: Classify as SIL 2, implement structured software development methods with + moderate verification requirements and hardware fault tolerance measures + C: Classify as SIL 3, implement rigorous software development processes with + extensive verification and validation plus high hardware fault tolerance + D: Classify as SIL 4, implement the most stringent development processes with + maximum verification requirements and highest level hardware redundancy + correct: B diff --git a/evaluations/specs/impact-mapping.yaml b/evaluations/specs/impact-mapping.yaml new file mode 100644 index 0000000..046474e --- /dev/null +++ b/evaluations/specs/impact-mapping.yaml @@ -0,0 +1,39 @@ +anchor: impact-mapping +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Impact Mapping"? + + ' + options: + A: User → Stories → Epics → Features; requirement decomposition methodology + focusing on user needs and system capabilities + B: Goal → Actors → Impacts → Deliverables; business objective (why?), people who can help or hinder it (who?), behavior changes needed from them (how?), and what the team can deliver to support those changes (what?)
+ C: Problem → Analysis → Design → Implementation; systematic approach to software + development through structured phase transitions + D: Stakeholder → Requirements → Architecture → Code; traceability framework + linking business needs to technical implementation details + correct: B + application: + scenario: Your e-commerce platform team has been asked to increase customer retention + by 15% over the next six months. The stakeholders have different opinions on + what features to build, with marketing wanting loyalty programs, engineering + suggesting performance improvements, and customer service pushing for better + support tools. + anchor_prompt: using Impact Mapping + paraphrase_prompt: to create a goal-oriented plan that connects business outcomes + to specific deliverables while identifying key stakeholders + options: + A: Create a feature prioritization matrix ranking loyalty programs, performance + improvements, and support tools based on development effort and expected customer + impact scores. + B: Map the retention goal to key actors (existing customers, support agents, + marketing team), identify how their behaviors need to change, then determine + what deliverables enable those behavior changes. + C: Conduct user story mapping sessions with all stakeholders to create a shared + product backlog organized by customer journey stages and prioritized by business + value. + D: Develop a roadmap showing three parallel workstreams for loyalty features, + performance optimization, and support enhancements with clear milestones and + dependencies. + correct: B diff --git a/evaluations/specs/invest.yaml b/evaluations/specs/invest.yaml new file mode 100644 index 0000000..8aea817 --- /dev/null +++ b/evaluations/specs/invest.yaml @@ -0,0 +1,45 @@ +anchor: invest +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "INVEST"? 
+ + ' + options: + A: Stories should be prioritized using Must have, Should have, Could have, and + Won't have categories to ensure critical requirements are delivered first; + stakeholders rank features by business value and implementation complexity + to guide sprint planning and resource allocation. + B: Stories should be self-contained and deliverable in any order; avoid dependencies + between stories that force a fixed implementation sequence; stories are not + contracts; the details are open to discussion between team and stakeholders + until they enter a sprint + C: Stories should be mapped chronologically along a user journey timeline with + supporting tasks underneath; teams visualize the complete user experience + to identify gaps, prioritize releases, and maintain focus on delivering end-to-end + value through iterative development cycles. + D: Stories should follow a standardized template with acceptance criteria, definition + of done, and effort estimates; teams must complete detailed analysis and obtain + formal approval from product owners before any story can be moved into development + or testing phases. + correct: B + application: + scenario: 'Your team is reviewing user stories during backlog refinement for an + e-commerce platform. One story reads: ''As a customer, I want the system to + be faster and more secure so that I have a better experience.'' The team is + struggling to estimate this story and cannot determine what ''done'' looks like.' + anchor_prompt: using INVEST + paraphrase_prompt: What should you do to make this story ready for sprint planning? 
+ options: + A: Add more detailed technical specifications about performance benchmarks and + security protocols, then assign it to the most senior developer who can handle + the complexity + B: Split this into multiple smaller stories with specific acceptance criteria, + such as 'reduce page load time to under 2 seconds' and 'implement two-factor + authentication for login' + C: Move the story to the next release cycle and create a technical spike to + research all possible performance and security improvements before writing + any user stories + D: Keep the story as-is but add story points based on the team's gut feeling, + since users clearly value performance and security improvements + correct: B diff --git a/evaluations/specs/iso-25010.yaml b/evaluations/specs/iso-25010.yaml new file mode 100644 index 0000000..2cd2dcb --- /dev/null +++ b/evaluations/specs/iso-25010.yaml @@ -0,0 +1,51 @@ +anchor: iso-25010 +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "ISO/IEC 25010"? + + ' + options: + A: A comprehensive framework for software architecture evaluation that provides + systematic methods to assess quality attributes through scenario-based analysis, + stakeholder workshops, and risk identification processes to determine architectural + trade-offs and decisions. 
+ B: 'Eight top-level quality characteristics that describe the quality of a software + product: Functional Suitability, Performance Efficiency, Compatibility, Usability, + Reliability, Security, Maintainability, and Portability; the degree to which + the product provides functions that meet stated and implied needs — sub-characteristics: + functional completeness, functional correctness, functional appropriateness' + C: A standardized template structure consisting of twelve sections for documenting + software architecture decisions including context, functional requirements, + building blocks, runtime views, deployment views, and architectural decisions + with their rationales and consequences. + D: A lightweight methodology for capturing and communicating architectural decisions + through structured records that document the title, status, context, decision + rationale, and consequences of significant architectural choices made during + software development projects. + correct: B + application: + scenario: Your development team is building a new mobile banking application and + needs to establish quality requirements for the project. The product owner has + expressed concerns about user satisfaction, system downtime, and data protection, + but the requirements are currently vague and unmeasurable. The team needs a + structured approach to define specific, testable quality criteria that align + with industry standards. + anchor_prompt: using ISO/IEC 25010 + paraphrase_prompt: to establish comprehensive, measurable quality requirements + that cover all critical aspects of software quality + options: + A: Focus primarily on functional requirements and add basic performance benchmarks, + security protocols, and user acceptance criteria as secondary considerations + to be refined during testing phases. 
+ B: 'Define specific measurable requirements across the eight quality characteristics: + functional suitability, performance efficiency, compatibility, usability, + reliability, security, maintainability, and portability, with concrete sub-characteristics + for each.' + C: Create a custom quality framework based on stakeholder interviews, competitor + analysis, and industry best practices, then map these findings to relevant + testing strategies and acceptance criteria. + D: Establish quality gates focused on code coverage, automated testing results, + security scan outcomes, and user story completion rates to ensure comprehensive + quality measurement throughout development. + correct: B diff --git a/evaluations/specs/jobs-to-be-done.yaml b/evaluations/specs/jobs-to-be-done.yaml new file mode 100644 index 0000000..f659523 --- /dev/null +++ b/evaluations/specs/jobs-to-be-done.yaml @@ -0,0 +1,37 @@ +anchor: jobs-to-be-done +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Jobs To Be Done (JTBD)"? + + ' + options: + A: A systematic approach to breaking down complex projects into smaller, manageable + tasks with clear deliverables + B: Progress people want to make in a particular context; practical task to accomplish + C: A framework for defining user personas and their specific roles within an + organization or system + D: A methodology for mapping employee responsibilities to business objectives + and performance metrics + correct: B + application: + scenario: A fitness app company is struggling with low user retention despite + having comprehensive features like workout tracking, nutrition logging, and + social sharing. Users download the app but stop using it within two weeks. The + product team needs to understand why users aren't sticking with their solution. 
+ anchor_prompt: using Jobs To Be Done (JTBD) + paraphrase_prompt: What approach should the product team take to understand the + underlying reasons users seek fitness solutions and why they abandon the current + app? + options: + A: Conduct user surveys asking about preferred features, UI design feedback, + and demographic information to identify which user segments need different + functionality + B: Interview users about the specific circumstances that led them to seek a + fitness solution, what progress they were trying to make, and what they hired + instead when they stopped using the app + C: Analyze competitor apps to identify missing features and benchmark against + industry best practices for user engagement and retention metrics + D: Create detailed user personas based on age, fitness level, and lifestyle + to design targeted features for each segment and improve onboarding flows + correct: B diff --git a/evaluations/specs/lasr.yaml b/evaluations/specs/lasr.yaml new file mode 100644 index 0000000..f91be6f --- /dev/null +++ b/evaluations/specs/lasr.yaml @@ -0,0 +1,44 @@ +anchor: lasr +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "LASR according to Toth/Zörner"? 
+ + ' + options: + A: Lightweight Architecture Decision Records that capture the context, decision, + and consequences of significant architectural choices using a standardized + template format for documentation and communication purposes + B: High-level description of how the solution addresses the most important quality + requirements and constraints; the central architectural ideas that shape the + system; key structural and runtime views showing the main building blocks, + their responsibilities, and how they interact at runtime + C: Low-level technical specification that defines implementation details, coding + standards, and deployment procedures; focuses on concrete technology choices + and step-by-step guidance for development teams + D: Hierarchical visual modeling technique using context, container, component, + and code diagrams to represent software architecture at different levels of + abstraction for stakeholder communication and system understanding + correct: B + application: + scenario: Your team has just completed the initial architecture design for a new + e-commerce platform that must handle high traffic loads and integrate with multiple + payment providers. The product owner and development teams need to understand + the key architectural decisions before implementation begins. + anchor_prompt: using LASR according to Toth/Zörner + paraphrase_prompt: What should you focus on when creating a lightweight architecture + document that effectively communicates the essential architectural information + to stakeholders? + options: + A: Document the complete system context, detailed component specifications, + comprehensive deployment views, and full traceability matrices to ensure nothing + is missed. + B: Focus on the core solution strategy for handling traffic loads, key structural + components and their interactions, critical interfaces with payment providers, + and identified scalability risks with mitigation plans. 
+ C: Create detailed use case diagrams, complete database schemas, exhaustive + API documentation, and comprehensive testing strategies to cover all system + aspects. + D: Emphasize stakeholder concerns, detailed quality scenarios, complete constraint + documentation, and thorough architectural decision records with full rationale. + correct: B diff --git a/evaluations/specs/linddun.yaml b/evaluations/specs/linddun.yaml new file mode 100644 index 0000000..c5e95a6 --- /dev/null +++ b/evaluations/specs/linddun.yaml @@ -0,0 +1,39 @@ +anchor: linddun +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "LINDDUN"? + + ' + options: + A: A security threat modeling framework that categorizes threats into Spoofing, + Tampering, Repudiation, Information Disclosure, Denial of Service, and Elevation + of Privilege + B: A privacy threat modeling framework that systematically identifies threats + like Linkability, Identifiability, Non-repudiation, Detectability, Disclosure + of information, Unawareness, and Non-compliance + C: A data protection compliance checklist for ensuring GDPR conformity through + technical and organizational measures + D: A risk assessment methodology that evaluates security controls against a + catalog of known vulnerabilities and attack patterns + correct: B + application: + scenario: A healthcare startup is developing a telemedicine platform that collects + patient medical records, processes video consultations, and shares data with + insurance providers. The development team needs to systematically identify potential + privacy risks before the platform launches to ensure GDPR compliance. + anchor_prompt: using LINDDUN + paraphrase_prompt: What systematic approach should the team take to comprehensively + identify and categorize privacy threats across all aspects of their data handling + practices? 
+ options: + A: Focus primarily on encryption and access controls, then conduct penetration + testing to identify vulnerabilities in data transmission and storage mechanisms. + B: 'Systematically analyze the system against seven privacy threat categories: + Linkability, Identifiability, Non-repudiation, Detectability, Disclosure, + Unawareness, and Non-compliance to identify specific privacy risks.' + C: Implement a risk assessment matrix focusing on data classification, user + authentication, and regulatory compliance requirements across different jurisdictions. + D: Conduct stakeholder interviews to understand privacy concerns, then map data + flows and apply general security threat modeling techniques to identify risks. + correct: B diff --git a/evaluations/specs/llm-evaluations.yaml b/evaluations/specs/llm-evaluations.yaml new file mode 100644 index 0000000..40c4f47 --- /dev/null +++ b/evaluations/specs/llm-evaluations.yaml @@ -0,0 +1,44 @@ +anchor: llm-evaluations +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "LLM-Evaluations"? + + ' + options: + A: Automated testing frameworks that generate adversarial prompts and edge cases + to identify failure modes in language models through mutation-based prompt + engineering and stress testing methodologies. + B: Standardized datasets and tasks used to compare LLM capabilities — MMLU (Massive + Multitask Language Understanding), HellaSwag, HumanEval, BIG-Bench, GSM8K, + TruthfulQA, ARC; quantitative measures of model quality — perplexity, accuracy, + BLEU, ROUGE, F1, pass@k (code generation), exact match, calibration + C: Architectural design patterns and best practices for deploying large language + models in production environments, including load balancing, caching strategies, + model versioning, and API gateway configurations.
+ D: Chain-of-thought reasoning techniques that enable language models to break + down complex problems into step-by-step logical processes, improving performance + on mathematical and analytical tasks through structured prompting. + correct: B + application: + scenario: Your team has developed a new code generation LLM and needs to compare + its performance against existing models like GPT-4 and Claude before deciding + whether to deploy it in production. The model will be used for generating Python + functions from natural language descriptions in your company's development workflow. + anchor_prompt: using LLM-Evaluations + paraphrase_prompt: What systematic approach should you take to rigorously assess + and compare your model's capabilities against established alternatives? + options: + A: Run your model on a few hand-selected coding problems, measure basic accuracy, + and compare the results against published performance numbers from other models' + documentation. + B: Evaluate on standardized benchmarks like HumanEval using pass@k metrics, + conduct human preference comparisons for code quality, and test for potential + training data contamination across multiple coding tasks. + C: Deploy all models to a staging environment, collect user feedback over several + weeks, and choose the model that receives the highest average satisfaction + ratings from developers. + D: Measure inference speed and memory usage across different model sizes, then + select the model that provides the best performance-to-cost ratio for your + infrastructure requirements. + correct: B diff --git a/evaluations/specs/madr.yaml b/evaluations/specs/madr.yaml new file mode 100644 index 0000000..3f574a0 --- /dev/null +++ b/evaluations/specs/madr.yaml @@ -0,0 +1,37 @@ +anchor: madr +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "MADR"? 
+ + ' + options: + A: Agile methodology for iterative decision-making processes; collaborative + framework + B: Well-defined format with specific sections; standard fields + C: Software architecture pattern for microservices decomposition; modular design + approach + D: Risk assessment framework for technical debt management; evaluation methodology + correct: B + application: + scenario: Your team is evaluating three different caching strategies (Redis, Memcached, + or in-memory caching) for a high-traffic e-commerce platform. The decision will + significantly impact performance, scalability, and operational complexity. You + need to document this architectural decision for future reference and stakeholder + review. + anchor_prompt: using MADR + paraphrase_prompt: document this architectural decision with a structured approach + that explicitly captures all evaluated alternatives and their trade-offs + options: + A: Create a simple decision log entry with the chosen option (Redis) and a brief + rationale, then store it in the project wiki for easy access and updates. + B: Document the decision with sections for context, decision drivers, all three + caching options as considered alternatives, pros/cons analysis for each, and + the final outcome with justification. + C: Write a comprehensive technical specification document detailing the implementation + approach for Redis, including configuration parameters, monitoring setup, + and deployment procedures. + D: Record the decision in a structured format focusing primarily on the chosen + solution's benefits and implementation details, with minimal coverage of rejected + alternatives. + correct: B diff --git a/evaluations/specs/mece.yaml b/evaluations/specs/mece.yaml new file mode 100644 index 0000000..ece104e --- /dev/null +++ b/evaluations/specs/mece.yaml @@ -0,0 +1,33 @@ +anchor: mece +tier: 3 +questions: + recognition: + question: 'Which of the following best describes the "MECE Principle"? 
+ + ' + options: + A: Prioritizing requirements into Must/Should/Could/Won't categories + B: Structuring categories so they do not overlap and collectively cover all + possibilities + C: Presenting the conclusion first, then organizing supporting arguments hierarchically + D: Decomposing work into independent, negotiable, and testable user stories + correct: B + application: + scenario: Your team is designing a new e-commerce platform and needs to organize + the main functional areas into microservices. The platform must handle user + management, product catalog, shopping cart, order processing, payment handling, + inventory tracking, and customer support features. + anchor_prompt: using MECE Principle + paraphrase_prompt: How should you organize these functional areas to ensure complete + coverage with no overlapping responsibilities between services? + options: + A: 'Group by user-facing vs backend services: (User Management, Product Catalog, + Shopping Cart) and (Order Processing, Payment, Inventory, Support)' + B: 'Organize by business capability: User Service, Catalog Service, Cart Service, + Order Service, Payment Service, Inventory Service, Support Service' + C: 'Structure by data access patterns: Read-heavy services (Catalog, Support) + and Write-heavy services (User, Cart, Order, Payment, Inventory)' + D: 'Arrange by development team expertise: Core services (User, Product, Cart), + Transaction services (Order, Payment), and Operations services (Inventory, + Support)' + correct: B diff --git a/evaluations/specs/morphological-box.yaml b/evaluations/specs/morphological-box.yaml new file mode 100644 index 0000000..d6b3728 --- /dev/null +++ b/evaluations/specs/morphological-box.yaml @@ -0,0 +1,39 @@ +anchor: morphological-box +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Morphological Box"? 
+ + ' + options: + A: Systematically evaluate and score alternative solutions against weighted + criteria using a decision matrix approach + B: Break complex problem into independent parameters/dimensions; identify possible + values/options for each parameter + C: Create mutually exclusive and collectively exhaustive categories to ensure + complete problem space coverage without overlap + D: Generate creative solutions by combining random elements from different domains + through structured brainstorming techniques + correct: B + application: + scenario: 'Your team is designing a new API gateway solution and needs to explore + all possible architectural combinations. There are multiple independent dimensions + to consider: authentication methods (OAuth2, JWT, API keys, mTLS), rate limiting + strategies (token bucket, sliding window, fixed window), storage backends (Redis, + PostgreSQL, DynamoDB), and deployment models (containerized, serverless, VM-based).' + anchor_prompt: using Morphological Box + paraphrase_prompt: What systematic approach should you take to ensure you've considered + all viable architectural combinations before making design decisions? 
+ options: + A: Focus on the most critical dimension first, select the best option for that + dimension, then optimize the remaining dimensions around that choice to reduce + complexity + B: Create a matrix with each dimension as a column, list all possible options + for each dimension as rows, then systematically evaluate combinations while + filtering out infeasible ones + C: Conduct stakeholder interviews to determine preferences for each dimension, + then use weighted scoring to rank the top three combinations based on business + priorities + D: Research industry best practices for each dimension independently, then combine + the most popular choices from each category to create a proven solution + correct: B diff --git a/evaluations/specs/moscow.yaml b/evaluations/specs/moscow.yaml new file mode 100644 index 0000000..9940477 --- /dev/null +++ b/evaluations/specs/moscow.yaml @@ -0,0 +1,44 @@ +anchor: moscow +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "MoSCoW"? + + ' + options: + A: A visual mapping technique that organizes user stories chronologically to + identify gaps and prioritize features based on user journey stages and business + value delivery. + B: Non-negotiable requirements essential for the current delivery; without them + the solution is unusable or unsafe; important requirements that are not vital; + painful to leave out but the solution is still viable without them + C: A collaborative workshop method for creating shared understanding of project + scope by mapping stakeholders, impacts, and deliverables against strategic + business objectives. + D: A risk assessment framework that categorizes project uncertainties into severity + levels to determine mitigation strategies and contingency planning approaches + for delivery teams. + correct: B + application: + scenario: Your agile team has 15 user stories estimated at 120 story points for + the next 3-week sprint, but your velocity is only 80 points. 
The product owner + needs to decide which stories to include while ensuring stakeholders understand + what won't be delivered. + anchor_prompt: using MoSCoW + paraphrase_prompt: How should you categorize and communicate the stories to stakeholders + for this sprint? + options: + A: Rank all 15 stories from 1-15 by business value, select the top-ranked stories + that fit within 80 points, and inform stakeholders that lower-ranked items + are deferred to future sprints. + B: Categorize stories into Must have (critical for sprint goal), Should have + (important but not essential), Could have (nice to have), and Won't have this + sprint (explicitly out of scope), then select from each category to fit 80 + points. + C: Group stories by feature area, estimate the effort for each group, select + complete feature groups that fit within 80 points, and communicate to stakeholders + which feature areas are postponed. + D: Sort stories by technical complexity and business impact using a 2x2 matrix, + prioritize high-impact low-complexity items first, and explain to stakeholders + which quadrants won't be addressed this sprint. + correct: B diff --git a/evaluations/specs/mutation-testing.yaml b/evaluations/specs/mutation-testing.yaml new file mode 100644 index 0000000..d90898b --- /dev/null +++ b/evaluations/specs/mutation-testing.yaml @@ -0,0 +1,35 @@ +anchor: mutation-testing +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Mutation Testing"? 
+ + ' + options: + A: Analyze genetic algorithms and evolutionary programming techniques to optimize + software performance through iterative code modifications + B: Evaluate how effective tests are at detecting bugs; deliberately introduce + small, syntactic changes (mutants) into source code + C: Test software behavior under various input combinations by systematically + changing configuration parameters and environmental conditions + D: Verify code stability by introducing controlled runtime errors and monitoring + how the system handles unexpected failures + correct: B + application: + scenario: Your team has achieved 95% code coverage on a critical payment processing + module, but recent production bugs have slipped through despite the high coverage + numbers. Management is questioning whether the existing test suite is actually + effective at catching defects. + anchor_prompt: using Mutation Testing + paraphrase_prompt: to evaluate whether your tests can actually detect bugs rather + than just execute code + options: + A: Add more unit tests to increase code coverage to 100% and focus on testing + edge cases that weren't previously covered + B: Introduce small deliberate code changes like flipping operators or removing + statements, then run your test suite to see which changes go undetected + C: Perform static code analysis to identify potential vulnerabilities and code + smells that could lead to production defects + D: Implement property-based testing to generate random inputs and verify that + system invariants hold across all scenarios + correct: B diff --git a/evaluations/specs/negative-control.yaml b/evaluations/specs/negative-control.yaml new file mode 100644 index 0000000..b7664a2 --- /dev/null +++ b/evaluations/specs/negative-control.yaml @@ -0,0 +1,17 @@ +anchor: negative-control +tier: null + +# Negative control: a fabricated anchor that does not exist. +# Tests whether the model distinguishes real anchors from invented ones. 
+# Correct answer is D ("not an established methodology"). + +questions: + recognition: + question: | + "Müller's Inversion Principle" is: + options: + A: "A testing methodology that reverses test execution order to detect hidden state dependencies" + B: "An architecture pattern that inverts control flow between framework and application code" + C: "A requirements technique that starts from the desired outcome and works backward to inputs" + D: "Not an established methodology — this term does not refer to a recognized concept" + correct: D diff --git a/evaluations/specs/nelson-rules.yaml b/evaluations/specs/nelson-rules.yaml new file mode 100644 index 0000000..ccd9bea --- /dev/null +++ b/evaluations/specs/nelson-rules.yaml @@ -0,0 +1,36 @@ +anchor: nelson-rules +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Nelson Rules"? + + ' + options: + A: Two consecutive points beyond 2σ (Warning); 7 alternating points above and + below the mean (oscillation) + B: One point beyond 3σ (Outlier); 9 consecutive points on the same side of the + mean (shift/bias) + C: Four points in a row increasing or decreasing (Trend); 6 consecutive points + within 1σ (Clustering) + D: Eight points beyond 1σ on alternating sides (Variance); 5 consecutive points + forming a pattern (Sequence) + correct: B + application: + scenario: Your team is monitoring API response times using a control chart with + mean=200ms and standard deviation=50ms. Over the past 15 data points, you notice + that 14 consecutive measurements have been alternating between values above + and below the mean (e.g., 220ms, 180ms, 230ms, 170ms, etc.), creating a zigzag + pattern. 
+ anchor_prompt: using Nelson Rules + paraphrase_prompt: to systematically detect whether this alternating pattern indicates + a non-random process issue that requires investigation + options: + A: Focus only on whether any individual points exceed 3 standard deviations + from the mean, as this is the most reliable indicator of process problems + B: Flag this as a special cause violation since 14 alternating points up and + down indicates systematic oscillation that suggests an assignable cause + C: Wait for at least 9 consecutive points on the same side of the mean before + concluding there is a process shift requiring attention + D: Check if 4 out of 5 consecutive points fall beyond 1 standard deviation on + the same side before taking corrective action + correct: B diff --git a/evaluations/specs/owasp-top-10.yaml b/evaluations/specs/owasp-top-10.yaml new file mode 100644 index 0000000..9d8c85a --- /dev/null +++ b/evaluations/specs/owasp-top-10.yaml @@ -0,0 +1,42 @@ +anchor: owasp-top-10 +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "OWASP Top 10"? + + ' + options: + A: A regularly updated list of the ten most critical web application security + risks, covering threats like injection, broken access control, and cryptographic + failures + B: A comprehensive framework for assessing software reliability and safety integrity + levels in critical systems with four distinct SIL classifications + C: An industry standard methodology for evaluating cybersecurity maturity across + organizational processes with five progressive capability levels + D: A systematic approach to privacy threat modeling that identifies linkability, + identifiability, and disclosure risks in software systems + correct: A + application: + scenario: Your team is conducting a security review of a web application that + handles customer financial data. 
During the assessment, you discover that user + input from web forms is directly concatenated into SQL queries without validation, + the application uses default database credentials, and sensitive customer data + is stored in plain text. The development team needs to prioritize which security + issues to address first. + anchor_prompt: using OWASP Top 10 + paraphrase_prompt: prioritize these security vulnerabilities based on established + web application security risk categories + options: + A: Focus on the plain text storage issue first since data encryption is the + foundation of all security, then address input validation, and finally update + default credentials during the next major release cycle. + B: Address the SQL injection vulnerability first (A03 - Injection), then fix + the plain text storage (A02 - Cryptographic Failures), and finally remediate + the default credentials (A05 - Security Misconfiguration). + C: Implement comprehensive logging and monitoring capabilities first to detect + future attacks, then gradually address the technical vulnerabilities based + on development team availability and sprint capacity. + D: Prioritize fixing the default credentials first since they provide the easiest + attack vector, then address data encryption, and finally implement input validation + as a long-term security enhancement. + correct: B diff --git a/evaluations/specs/plain-english-strunk-white.yaml b/evaluations/specs/plain-english-strunk-white.yaml new file mode 100644 index 0000000..812b902 --- /dev/null +++ b/evaluations/specs/plain-english-strunk-white.yaml @@ -0,0 +1,50 @@ +anchor: plain-english-strunk-white +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Plain English according to Strunk + & White"? 
+ + ' + options: + A: Write with a clear hierarchical structure where conclusions come first followed + by supporting arguments, using logical groupings that allow readers to understand + the main point before diving into details — start with the answer, then provide + the reasoning that led to that conclusion. + B: Every word in a sentence should serve a purpose; cut words that add bulk + without adding meaning — "the fact that" → "that", "owing to the fact that" + → "since"; prefer active constructions over passive; active voice is more + direct, vigorous, and concise — "the dog bit the man" not "the man was bitten + by the dog" + C: Use simple, everyday vocabulary and short sentences that can be understood + by the general public, avoiding technical jargon, complex grammatical structures, + and industry-specific terminology — choose 'help' over 'facilitate', 'use' + over 'utilize', and 'show' over 'demonstrate'. + D: Organize information in a bottom-line-up-front approach where the most important + message appears at the beginning, followed by supporting details arranged + in decreasing order of importance — present key findings first, then provide + the analysis and background that supports those conclusions. + correct: B + application: + scenario: A software engineer is reviewing API documentation that describes error + handling procedures. The current draft contains several sentences that feel + wordy and unclear, making it difficult for developers to quickly understand + what actions to take when errors occur. + anchor_prompt: using Plain English according to Strunk & White + paraphrase_prompt: to make the error handling documentation as clear, direct, + and concise as possible for developer users + options: + A: Add more detailed explanations and qualifying phrases like 'generally speaking' + and 'in most cases' to ensure developers understand the nuanced conditions + under which different error handling approaches might be considered appropriate. 
+ B: Remove unnecessary words, use active voice, and place the most important + action at the end of each sentence. Replace 'In the event that an error occurs' + with 'When an error occurs' and 'The system will be restarted by the administrator' + with 'The administrator restarts the system.' + C: Include comprehensive background context about why each error might occur, + using rich descriptive language and multiple adjectives to paint a complete + picture of potential system states and failure modes. + D: Restructure sentences to use passive voice consistently, add transitional + phrases between concepts, and include apologetic language like 'unfortunately' + and 'regrettably' to acknowledge the inconvenience of errors. + correct: B diff --git a/evaluations/specs/prd.yaml b/evaluations/specs/prd.yaml new file mode 100644 index 0000000..d25c7ce --- /dev/null +++ b/evaluations/specs/prd.yaml @@ -0,0 +1,41 @@ +anchor: prd +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "PRD"? + + ' + options: + A: Systematic prioritization framework using Must have, Should have, Could have, + and Won't have categories to rank feature requirements based on business value + and urgency + B: Clear articulation of the problem to be solved and the target users; measurable + outcomes that define what "done" looks like (KPIs, OKRs) + C: Visual representation of user activities and tasks arranged chronologically + to identify gaps, priorities, and release planning opportunities for product + development + D: Structured template format capturing user needs as role-based scenarios with + acceptance criteria to ensure requirements are testable and implementable + by development teams + correct: B + application: + scenario: Your startup is building a new mobile app for freelance project management.
+ The engineering team keeps asking clarifying questions about features, the design + team is unsure about user workflows, and marketing needs to understand the target + audience. Stakeholders have conflicting ideas about what should be included + in the first release. + anchor_prompt: using PRD + paraphrase_prompt: What document should you create to align all teams and provide + a comprehensive foundation for product development? + options: + A: Create a technical architecture document that outlines the system components, + database schema, and API specifications to guide the engineering team's implementation + decisions. + B: Write a comprehensive document that defines the problem statement, target + users, success metrics, functional requirements, scope boundaries, and constraints + to align all stakeholders. + C: Develop a project timeline with detailed user stories, acceptance criteria, + and sprint planning to coordinate development activities across all teams. + D: Conduct stakeholder interviews and create a competitive analysis report with + market research findings to inform strategic product positioning decisions. + correct: B diff --git a/evaluations/specs/problem-space-nvc.yaml b/evaluations/specs/problem-space-nvc.yaml new file mode 100644 index 0000000..fb99de0 --- /dev/null +++ b/evaluations/specs/problem-space-nvc.yaml @@ -0,0 +1,47 @@ +anchor: problem-space-nvc +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Nonviolent Communication (Rosenberg)"? + + ' + options: + A: A structured approach to code reviews that emphasizes constructive feedback + through specific examples and actionable suggestions rather than general criticism. + B: Concrete, objective facts without evaluation or judgment. "The deploy failed + three times this week" instead of "The deploy always fails."; emotions arising + from observations. "I feel frustrated" instead of "This is frustrating."
+ C: A conflict resolution methodology that focuses on identifying root causes + of team disagreements and establishing clear communication protocols between + stakeholders. + D: An agile communication framework that prioritizes transparent status updates + and eliminates ambiguous language in sprint retrospectives and daily standups. + correct: B + application: + scenario: 'A product manager sends an email to the development team: ''The search + feature is completely broken and users are complaining constantly. You developers + never test anything properly before releasing. This is unacceptable and needs + to be fixed immediately.'' The team lead wants to respond in a way that addresses + the concerns while maintaining a collaborative relationship.' + anchor_prompt: using Nonviolent Communication (Rosenberg) + paraphrase_prompt: Transform this response to follow a structured approach that + separates facts from judgments, acknowledges emotions, identifies underlying + needs, and makes specific actionable requests. + options: + A: I understand you're frustrated with the search feature. However, saying we + 'never test properly' isn't accurate - we do have testing procedures. Let's + schedule a meeting to discuss how we can improve our QA process and address + the user complaints more systematically. + B: I notice the search feature has generated 15 user complaints this week. I + feel concerned because I value delivering quality software that meets user + needs. Could we schedule a 30-minute meeting tomorrow to review the specific + issues and create an action plan together? + C: Thanks for bringing this to our attention. You're right that the search feature + has issues and we need to address them quickly. I'll have the team prioritize + this as our top bug fix and we'll provide daily updates on our progress until + it's resolved. + D: I appreciate your passion for quality, and I share your concern about user + experience. 
While the search feature does have problems, let's focus on solutions + rather than blame. What specific search scenarios are failing, and what would + success look like from your perspective? + correct: B diff --git a/evaluations/specs/property-based-testing.yaml b/evaluations/specs/property-based-testing.yaml new file mode 100644 index 0000000..4330600 --- /dev/null +++ b/evaluations/specs/property-based-testing.yaml @@ -0,0 +1,36 @@ +anchor: property-based-testing +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Property-Based Testing"? + + ' + options: + A: Testing software by validating that object properties and attributes maintain + expected values throughout execution + B: Invariants that should always hold; automatic test data creation + C: A testing methodology that focuses on verifying ownership and access rights + of system resources and data + D: Unit testing approach that examines individual class properties and their + getter/setter method implementations + correct: B + application: + scenario: You're developing a financial calculator library with functions for + compound interest, loan payments, and currency conversions. The library will + be used by multiple client applications, and accuracy is critical since even + small rounding errors could accumulate into significant financial discrepancies + over time. + anchor_prompt: using Property-Based Testing + paraphrase_prompt: What testing approach would best validate that your financial + calculations maintain mathematical correctness across all possible input ranges? 
+ options: + A: Write comprehensive unit tests covering typical financial scenarios like + 30-year mortgages, common interest rates, and standard loan amounts + B: Define mathematical invariants like 'interest calculations should be commutative' + and generate thousands of random valid inputs to verify these properties always + hold + C: Create integration tests that simulate real user workflows by testing complete + financial scenarios from input to final calculation output + D: Implement regression tests using historical financial data from previous + system versions to ensure calculations remain consistent over time + correct: B diff --git a/evaluations/specs/pyramid-principle.yaml b/evaluations/specs/pyramid-principle.yaml new file mode 100644 index 0000000..3343a5d --- /dev/null +++ b/evaluations/specs/pyramid-principle.yaml @@ -0,0 +1,42 @@ +anchor: pyramid-principle +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Pyramid Principle according + to Barbara Minto"? + + ' + options: + A: Hierarchical software architecture pattern where base components support + higher-level modules; follows dependency inversion with abstractions at the + top layer + B: Single key message at the top of the pyramid; situation → complication → + question → answer structure for setting context + C: Project management framework organizing tasks in ascending priority levels; + uses risk assessment → resource allocation → timeline planning → execution + phases + D: Information architecture methodology structuring content from broad categories + to specific details; applies user journey mapping with navigation flow optimization + correct: B + application: + scenario: Your engineering team has discovered a critical security vulnerability + in the production system that requires immediate attention and significant resources + to fix. 
The CTO has requested a 10-minute presentation to the executive team + explaining the situation and recommending next steps. + anchor_prompt: using Pyramid Principle according to Barbara Minto + paraphrase_prompt: How should you structure your presentation to maximize clarity + and executive buy-in for your recommended solution? + options: + A: Start with technical details of the vulnerability, explain how it was discovered, + walk through potential attack vectors, then conclude with your recommended + fix and resource requirements. + B: Lead with your recommendation to allocate resources for immediate patching, + then explain the current security risk situation, the complications it creates + for business operations, and supporting evidence for your proposed solution. + C: Present three possible solutions with pros and cons for each, provide detailed + technical analysis of the vulnerability, then ask the executives to vote on + which approach they prefer. + D: Begin by establishing credibility through your team's security expertise, + chronologically explain how the vulnerability was discovered, detail the investigation + process, then present findings and recommendations. + correct: B diff --git a/evaluations/specs/sanity-check.yaml b/evaluations/specs/sanity-check.yaml new file mode 100644 index 0000000..c0b7c31 --- /dev/null +++ b/evaluations/specs/sanity-check.yaml @@ -0,0 +1,17 @@ +anchor: sanity-check +tier: null + +# Sanity check: none of the options is correct (the answer is 42). +# Every model MUST score 0% because it will pick a wrong option. +# If any model scores >0%, the scoring pipeline has a bug. + +questions: + recognition: + question: | + What is the Answer to the Ultimate Question of Life, the Universe, and Everything? 
+ options: + A: "17" + B: "23" + C: "99" + D: "256" + correct: X diff --git a/evaluations/specs/semantic-versioning.yaml b/evaluations/specs/semantic-versioning.yaml new file mode 100644 index 0000000..0bc881e --- /dev/null +++ b/evaluations/specs/semantic-versioning.yaml @@ -0,0 +1,32 @@ +anchor: semantic-versioning +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Semantic Versioning (SemVer)"? + + ' + options: + A: Version control system that automatically tracks semantic changes in code + structure and meaning across different branches + B: A versioning scheme using MAJOR.MINOR.PATCH where MAJOR signals breaking + changes, MINOR signals new features, and PATCH signals bug fixes + C: Development methodology that prioritizes meaningful variable and function + naming conventions to improve code readability + D: Documentation standard that requires detailed explanations of API functionality + and business logic for each software release + correct: B + application: + scenario: You maintain a JavaScript authentication library that currently has + version 2.3.1. You need to release an update that adds a new optional parameter + to an existing login method, includes several bug fixes for token validation, + and removes a deprecated method that was marked for removal six months ago. 
+ anchor_prompt: using Semantic Versioning (SemVer) + paraphrase_prompt: determine the appropriate version number for this release that + properly communicates the impact of changes to library consumers + options: + A: 2.3.2 - since the new parameter is optional and doesn't break existing code + B: 3.0.0 - because removing the deprecated method constitutes a breaking change + C: 2.4.0 - to reflect the addition of new functionality with the optional parameter + D: 2.3.1-update.1 - using pre-release notation to indicate multiple types of + changes + correct: B diff --git a/evaluations/specs/socratic-method.yaml b/evaluations/specs/socratic-method.yaml new file mode 100644 index 0000000..3d21b8f --- /dev/null +++ b/evaluations/specs/socratic-method.yaml @@ -0,0 +1,35 @@ +anchor: socratic-method +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Socratic Method"? + + ' + options: + A: Systematic approach to software development that emphasizes iterative refinement + through structured peer review and collaborative problem-solving sessions + B: Lead learners to insights through questions rather than direct instruction; + cross-examination technique to expose contradictions in beliefs + C: Teaching methodology that breaks complex problems into smaller components + and builds understanding through sequential presentation of foundational concepts + D: Architectural pattern that separates concerns by organizing code into distinct + layers with well-defined interfaces and dependency injection principles + correct: B + application: + scenario: During a code review, a senior developer notices that a junior developer + has implemented a caching solution that could cause data consistency issues + in a distributed system. The junior developer seems confident in their approach + and hasn't considered the potential problems. 
+ anchor_prompt: using Socratic Method + paraphrase_prompt: to help the junior developer discover the potential issues + through guided inquiry rather than direct criticism + options: + A: Point out the specific data consistency problems and explain why the current + caching approach won't work in a distributed environment. + B: Ask questions like 'What happens when multiple services update the same cached + data?' and 'How does your cache handle network partitions?' + C: Suggest they research distributed caching patterns and come back with alternative + solutions before proceeding with the implementation. + D: Approve the code for now but schedule a follow-up meeting to discuss distributed + systems architecture and caching strategies. + correct: B diff --git a/evaluations/specs/sota.yaml b/evaluations/specs/sota.yaml new file mode 100644 index 0000000..0d22440 --- /dev/null +++ b/evaluations/specs/sota.yaml @@ -0,0 +1,36 @@ +anchor: sota +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "SOTA (State-of-the-Art)"? + + ' + options: + A: A standardized framework for documenting software architecture decisions + and technical specifications across development teams + B: Focus on the most current, cutting-edge methods and techniques; reference + current research papers, benchmarks, and empirical results + C: A methodology for systematic testing and validation of software systems against + predefined quality assurance benchmarks + D: An agile development approach that emphasizes iterative prototyping and continuous + integration of emerging technologies + correct: B + application: + scenario: Your team is building a new document search system for a legal firm + that needs to handle complex queries across millions of legal documents. The + current keyword-based search is inadequate, and you need to implement semantic + search capabilities that can understand legal terminology and context. 
+ anchor_prompt: using SOTA (State-of-the-Art) + paraphrase_prompt: What approach would ensure you're implementing the most current + and highest-performing solution based on recent research and benchmarks? + options: + A: Implement a well-documented TF-IDF approach with legal domain customizations, + as it's proven reliable and easier to maintain than newer experimental methods. + B: Research recent papers on semantic search benchmarks, compare transformer-based + embedding models like BGE and E5, and implement the approach showing best + performance on legal document retrieval tasks. + C: Use the same semantic search architecture that worked well in your previous + project, making minor adjustments for the legal domain and document types. + D: Follow the semantic search tutorial from the framework documentation, as + it represents the vendor's recommended best practices for production systems. + correct: B diff --git a/evaluations/specs/spc.yaml b/evaluations/specs/spc.yaml new file mode 100644 index 0000000..13aefc9 --- /dev/null +++ b/evaluations/specs/spc.yaml @@ -0,0 +1,35 @@ +anchor: spc +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "SPC (Statistical Process Control)"? 
+ + ' + options: + A: Structured programming methodology that controls code execution flow through + systematic elimination of goto statements and unstructured branching + B: Systematic statistical monitoring of running processes; inherent, random + fluctuation — stable and predictable + C: Statistical performance computing framework that optimizes system resources + by analyzing computational workload patterns and predicting bottlenecks + D: Software process certification standard that validates development methodologies + through rigorous documentation and compliance verification procedures + correct: B + application: + scenario: Your web application's API response times have been averaging 250ms + over the past month, but this week you've noticed some responses taking 400-500ms. + The development team wants to determine if this represents a real performance + degradation that needs investigation or just normal fluctuation. + anchor_prompt: using SPC (Statistical Process Control) + paraphrase_prompt: to systematically distinguish between normal process variation + and signals that indicate a real change requiring intervention + options: + A: Set a fixed threshold at 300ms and alert whenever any single response exceeds + this limit, then investigate each alert individually + B: Plot response times on a control chart with calculated control limits, then + apply detection rules to identify when the process shows special cause variation + C: Compare this week's average response time to last week's using a t-test and + investigate if the difference is statistically significant + D: Monitor the 95th percentile response time and trigger an investigation whenever + it increases by more than 10% from the baseline + correct: B diff --git a/evaluations/specs/stride.yaml b/evaluations/specs/stride.yaml new file mode 100644 index 0000000..69b9e74 --- /dev/null +++ b/evaluations/specs/stride.yaml @@ -0,0 +1,41 @@ +anchor: stride +tier: 3 +questions: + recognition: + question: 
'Which of the following best describes "STRIDE Threat Model"? + + ' + options: + A: A systematic approach for identifying and categorizing the top ten most critical + web application security risks; maintained by OWASP foundation; focuses on + injection flaws, broken authentication, and sensitive data exposure vulnerabilities + B: Impersonating another user, process, or system to gain unauthorized access; + mitigated by strong authentication; unauthorized modification of data in transit + or at rest; mitigated by integrity controls, digital signatures, and access + controls + C: A defensive security framework that assumes breach scenarios and implements + zero-trust principles; emphasizes continuous verification, least privilege + access, and micro-segmentation to limit lateral movement within networks + D: A risk assessment methodology for evaluating security controls in regulated + environments; provides quantitative scoring based on asset criticality, threat + likelihood, and business impact to prioritize remediation efforts + correct: B + application: + scenario: Your team is designing a new online banking application that handles + user authentication, financial transactions, and account data. During the security + design review, you need to systematically identify potential security threats + that could affect different components of the system. 
+ anchor_prompt: using STRIDE Threat Model + paraphrase_prompt: systematically categorize the security threats by analyzing + each component against six fundamental threat categories + options: + A: Focus primarily on external attack vectors like SQL injection and cross-site + scripting, then assess the likelihood and business impact of each vulnerability + type + B: Examine each system component for Spoofing, Tampering, Repudiation, Information + Disclosure, Denial of Service, and Elevation of Privilege threats + C: Create attack trees starting from high-value assets, then trace backward + through all possible attack paths that could compromise those assets + D: Map all system entry points and data flows, then apply a risk rating matrix + based on confidentiality, integrity, and availability requirements + correct: B diff --git a/evaluations/specs/swot.yaml b/evaluations/specs/swot.yaml new file mode 100644 index 0000000..ef6edee --- /dev/null +++ b/evaluations/specs/swot.yaml @@ -0,0 +1,45 @@ +anchor: swot +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "SWOT"? 
+ + ' + options: + A: A prioritization framework that categorizes requirements into Should have, + Won't have this time, Optional features, and Time-critical deliverables to + manage project scope effectively + B: Internal positive attributes and resources that give the subject an advantage + over others; internal negative attributes or limitations that place the subject + at a disadvantage relative to others + C: A visual mapping technique that plots the evolution of components along a + value chain from genesis to commodity to identify strategic positioning and + dependencies + D: A decision-making matrix that systematically evaluates multiple alternatives + against weighted criteria by scoring each option to determine the optimal + solution objectively + correct: B + application: + scenario: Your team is evaluating whether to migrate from a monolithic architecture + to microservices for your e-commerce platform. The monolith has served you well + for 3 years but scaling challenges are emerging. You need to present a comprehensive + analysis to stakeholders covering all key factors that could influence this + architectural decision. + anchor_prompt: using SWOT + paraphrase_prompt: What framework should you use to systematically evaluate both + internal capabilities and external factors that could impact this architectural + migration decision? + options: + A: Create a decision matrix listing technical requirements as rows and architecture + options as columns, scoring each combination on feasibility and impact to + determine the optimal choice. + B: Analyze internal strengths and weaknesses of your current capabilities alongside + external opportunities and threats in the market to create a comprehensive + strategic assessment. + C: Map out the current value chain and evolution stages of each system component + to identify which parts are commodities versus differentiators before making + architectural changes. 
+ D: Categorize all migration requirements into must-have, should-have, could-have, + and won't-have priorities to focus development effort on the most critical + architectural changes first. + correct: B diff --git a/evaluations/specs/tdd-chicago-school.yaml b/evaluations/specs/tdd-chicago-school.yaml new file mode 100644 index 0000000..0370501 --- /dev/null +++ b/evaluations/specs/tdd-chicago-school.yaml @@ -0,0 +1,39 @@ +anchor: tdd-chicago-school +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "TDD, Chicago School"? + + ' + options: + A: Write tests first, then implement code to pass those tests; focus on behavior + verification through extensive mocking of all dependencies + B: Verify the state of objects after operations; use real objects whenever possible; + mock only external dependencies + C: Design software architecture by defining interfaces first; use dependency + injection to isolate components and enable comprehensive unit testing + D: Develop code in small iterations with continuous refactoring; emphasize pair + programming and collective code ownership practices + correct: B + application: + scenario: You're developing an e-commerce order processing system with complex + business rules for discounts, taxes, and inventory management. The team needs + to implement the core OrderCalculator class that handles pricing logic, integrating + with external payment and inventory services. + anchor_prompt: using TDD, Chicago School + paraphrase_prompt: What approach should you take to develop and test this core + business logic component? + options: + A: Start by mocking all dependencies including the payment service, inventory + service, and database connections, then write tests that verify method calls + and interactions between components. 
+ B: Begin with tests for the core pricing calculations using real domain objects, + mock only the external payment and inventory services, and let the design + emerge through refactoring cycles. + C: Create comprehensive interface definitions and mock implementations for all + collaborating objects first, then build the OrderCalculator by specifying + expected behavior through interaction testing. + D: Write integration tests that cover the entire order flow from UI to database, + then extract unit tests for individual components based on the integration + test scenarios. + correct: B diff --git a/evaluations/specs/tdd-london-school.yaml b/evaluations/specs/tdd-london-school.yaml new file mode 100644 index 0000000..56efbbb --- /dev/null +++ b/evaluations/specs/tdd-london-school.yaml @@ -0,0 +1,40 @@ +anchor: tdd-london-school +tier: 3 + +questions: + recognition: + question: | + Which of the following best describes "TDD, London School"? + options: + A: State-based testing with real collaborating objects and minimal mocking + B: Outside-in development with mock-heavy, interaction-based testing + C: Acceptance testing using Given/When/Then scenario specifications + D: Exploratory testing focused on edge cases and unspecified behavior + correct: B + + application: + scenario: | + You are reviewing a pull request. The code adds a new OrderService + that calls PaymentGateway and InventoryService. + What is your primary testing recommendation? 
+ anchor_prompt: "using TDD, London School principles" + paraphrase_prompt: "Write isolated tests for the service layer" + options: + A: Write a test that processes a real order end-to-end through all three services + B: Write a test that mocks PaymentGateway and InventoryService to verify OrderService interactions + C: Write a test that checks the database state after processing an order + D: Skip unit tests and write an integration test with a test database + correct: B + + consistency: + variants: + - 'Which proponent is most closely associated with "TDD, London School"?' + - 'Which proponent is most closely associated with "Mockist TDD"?' + - 'Which proponent is most closely associated with "Outside-In TDD"?' + language_variant: 'Welcher Proponent wird am engsten mit "TDD, London School" assoziiert?' + options: + A: Kent Beck + B: Steve Freeman + C: Dan North + D: Martin Fowler + correct: B diff --git a/evaluations/specs/testing-pyramid.yaml b/evaluations/specs/testing-pyramid.yaml new file mode 100644 index 0000000..77ff27c --- /dev/null +++ b/evaluations/specs/testing-pyramid.yaml @@ -0,0 +1,35 @@ +anchor: testing-pyramid +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Testing Pyramid"? + + ' + options: + A: A hierarchical structure where system tests form the base, integration tests + the middle, and unit tests the top layer + B: Three layers; more unit tests, fewer e2e tests + C: A risk assessment framework that categorizes software defects into three + priority levels based on severity impact + D: A test organization model where manual testing supports automated testing + which supports exploratory testing at the apex + correct: B + application: + scenario: Your team is developing an e-commerce platform and currently has 20 + unit tests, 50 integration tests, and 80 end-to-end tests. The CI/CD pipeline + takes 45 minutes to run, and developers are frustrated with slow feedback on + their commits. 
Management wants to improve development velocity while maintaining + quality. + anchor_prompt: using Testing Pyramid + paraphrase_prompt: What test distribution strategy would best optimize feedback + speed while maintaining comprehensive coverage? + options: + A: Increase all test types proportionally to 40 unit tests, 100 integration + tests, and 160 end-to-end tests for better coverage + B: Restructure to 200 unit tests, 40 integration tests, and 15 end-to-end tests, + moving logic validation to faster test layers + C: Focus primarily on integration tests with 30 unit tests, 120 integration + tests, and 20 end-to-end tests for balanced coverage + D: Maintain current ratios but optimize each test type for speed without changing + the overall distribution strategy + correct: B diff --git a/evaluations/specs/timtowtdi.yaml b/evaluations/specs/timtowtdi.yaml new file mode 100644 index 0000000..e545ac1 --- /dev/null +++ b/evaluations/specs/timtowtdi.yaml @@ -0,0 +1,36 @@ +anchor: timtowtdi +tier: 1 +questions: + recognition: + question: 'Which of the following best describes "TIMTOWTDI"? + + ' + options: + A: A principle that problems can have multiple equally valid solutions, favoring + flexibility over prescription + B: A testing strategy that combines multiple test types to maximize coverage + C: A design pattern that delegates decisions to the most informed component + at runtime + D: A refactoring approach that transforms complex code into simpler equivalent + forms step by step + correct: A + application: + scenario: 'Your team is implementing user authentication for a web application. + Three developers have proposed different approaches: JWT tokens with Redis caching, + session-based authentication with database storage, and OAuth integration with + a third-party provider. All three solutions meet the technical requirements + and security standards.' 
+ anchor_prompt: using TIMTOWTDI + paraphrase_prompt: How should the team handle this situation where multiple valid + technical approaches exist? + options: + A: Select the most popular industry standard approach to ensure long-term maintainability + and reduce technical risk. + B: Evaluate each approach's trade-offs in your specific context, discuss the + implications with the team, and choose based on your constraints rather than + dismissing valid alternatives. + C: Choose the approach proposed by the most senior developer to maintain team + hierarchy and avoid lengthy technical debates. + D: Implement the simplest solution first, then refactor to a more sophisticated + approach once you have more data about user requirements. + correct: B diff --git a/evaluations/specs/todotxt-flavoured-markdown.yaml b/evaluations/specs/todotxt-flavoured-markdown.yaml new file mode 100644 index 0000000..658d26e --- /dev/null +++ b/evaluations/specs/todotxt-flavoured-markdown.yaml @@ -0,0 +1,42 @@ +anchor: todotxt-flavoured-markdown +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "todo.txt-flavoured Markdown"? 
+ + ' + options: + A: A markup syntax that extends standard Markdown with project management features + using `@context` and `+project` tags, prioritized by numerical prefixes like + `1.`, `2.`, `3.` + B: Standard GitHub-flavoured markdown syntax (`- [ ]` uncompleted, `- [x]` completed); + uses todo.txt priority notation `(A)`, `(B)`, `(C)` where `(A)` is highest + priority + C: A documentation format that combines reStructuredText syntax with Kanban-style + workflow markers (`TODO:`, `DOING:`, `DONE:`) and uses hashtag priority levels + `#high`, `#medium`, `#low` + D: An issue tracking notation that merges JIRA-style ticket formatting with + plain text using bracketed status indicators `[OPEN]`, `[CLOSED]` and priority + weights expressed as `{P1}`, `{P2}`, `{P3}` + correct: B + application: + scenario: Your team is managing multiple feature development streams and bug fixes + across different projects. Team members need to track tasks that vary in priority, + belong to different projects, require specific tools or contexts, and have various + deadlines. + anchor_prompt: using todo.txt-flavoured Markdown + paraphrase_prompt: How should you structure your task list to combine readable + markdown formatting with systematic priority levels, project groupings, context + indicators, and searchable metadata? + options: + A: 'Use standard bullet points with custom formatting like `* HIGH: [Website] + Fix login bug - Computer work - Due: Feb 5th` and mark completed items by + moving them to a separate section' + B: Use checkbox syntax with priority letters, plus-prefixed project tags, at-prefixed + contexts, and key:value pairs like `- [ ] (A) Fix login bug +website @computer + due:2024-02-05` + C: Create separate markdown files for each priority level and use YAML frontmatter + to specify project, context, and due dates for each task list + D: 'Use numbered lists with emoji indicators for priority (🔥⚡⏰) and hashtag-style + tags like `1.
🔥 Fix login bug #website #computer #due-feb-5`' + correct: B diff --git a/evaluations/specs/user-story-mapping.yaml b/evaluations/specs/user-story-mapping.yaml new file mode 100644 index 0000000..08618d9 --- /dev/null +++ b/evaluations/specs/user-story-mapping.yaml @@ -0,0 +1,38 @@ +anchor: user-story-mapping +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "User Story Mapping"? + + ' + options: + A: Visual representation of user personas mapped to specific system requirements + and acceptance criteria + B: Horizontal arrangement of user activities; high-level tasks users perform + C: Hierarchical breakdown of software features organized by technical complexity + and development priority + D: Sequential workflow diagram showing user interactions and system responses + throughout the application lifecycle + correct: B + application: + scenario: Your team is building a new e-commerce mobile app and has collected + 47 user stories in the backlog. The product owner is struggling to explain the + release strategy to stakeholders, and developers are confused about how individual + stories connect to the overall user experience. + anchor_prompt: using User Story Mapping + paraphrase_prompt: What approach would best help the team visualize the complete + user journey and plan incremental releases? + options: + A: Group stories by technical complexity and implement the easiest ones first, + then present a demo to stakeholders showing completed features in order of + development difficulty. + B: Arrange stories horizontally by user activities in chronological order, then + stack them vertically by priority to identify thin slices of end-to-end functionality + for each release. + C: Create a detailed project timeline with all stories assigned to specific + sprints, then hold stakeholder meetings to review the Gantt chart and adjust + dates based on feedback. 
+ D: Categorize stories by user role and estimate story points for each category, + then create a burndown chart to track progress and communicate velocity to + stakeholders. + correct: B diff --git a/evaluations/specs/wardley-mapping.yaml b/evaluations/specs/wardley-mapping.yaml new file mode 100644 index 0000000..818df78 --- /dev/null +++ b/evaluations/specs/wardley-mapping.yaml @@ -0,0 +1,38 @@ +anchor: wardley-mapping +tier: 3 +questions: + recognition: + question: 'Which of the following best describes "Wardley Mapping"? + + ' + options: + A: Map system dependencies from infrastructure up; requirements → design → implementation + → deployment + B: Map components from user needs down; genesis → custom → product → commodity + C: Map stakeholder relationships outward; internal → partners → customers → + market segments + D: Map technical debt from legacy systems; identified → prioritized → refactored + → modernized + correct: B + application: + scenario: Your fintech startup is deciding whether to build a custom payment processing + system, integrate with an existing payment API like Stripe, or partner with + a traditional payment processor. The team is debating the strategic implications + of each approach for the company's long-term competitive position. + anchor_prompt: using Wardley Mapping + paraphrase_prompt: What strategic approach should guide this build-vs-buy-vs-partner + decision? + options: + A: Conduct a cost-benefit analysis comparing the total cost of ownership for + each option over a 3-year period, then select the lowest-cost solution that + meets current technical requirements. + B: Map the payment processing component's position on the evolution axis from + genesis to commodity, then choose build for genesis/custom stages and buy/partner + for product/commodity stages. 
+ C: Survey competitors to see what payment solutions they use, then select the + same approach as the most successful competitor to ensure market alignment + and reduce strategic risk. + D: Evaluate each option based on development team capacity and timeline constraints, + prioritizing the approach that can be implemented fastest while maintaining + acceptable quality standards. + correct: B