diff --git a/docs/anchor-evaluations.adoc b/docs/anchor-evaluations.adoc index f99a734..7ed8ea6 100644 --- a/docs/anchor-evaluations.adoc +++ b/docs/anchor-evaluations.adoc @@ -407,45 +407,63 @@ For Claude, this means Sonnet 4.6 (not Opus). For GPT and Gemini, the mid-tier variants are not yet clearly established, so we test the current flagship (GPT-5, Gemini 2.5 Pro) and add smaller variants when they become available. A follow-up round with the cheapest variants (Haiku, GPT-5 mini, Gemini Flash) would reveal the lower boundary of anchor activation. +IMPORTANT: Always record the *exact model identifier with date suffix* (e.g., `mistral-large-2512`, not `mistral-large-latest`). +Model aliases like `-latest` can change without notice. + *Commercial models (API cost per call):* -[cols="1,2"] +[cols="2,1,2"] |=== -|Model |Rationale +|Model |API ID |Rationale |Claude Sonnet 4.6 +|`claude-sonnet-4-20250514` |Our primary development model. Serves as the baseline. -|GPT-5 -|Largest market share, OpenAI ecosystem. +|GPT-4o / GPT-5 +|`gpt-4o` / `gpt-5` +|OpenAI ecosystem. GPT-4o as mid-tier, GPT-5 as flagship. + +|Mistral Large 3 +|`mistral-large-2512` +|European flagship. Already tested (96%). + +|Mistral Medium 3.1 +|`mistral-medium-2508` +|European mid-tier. Frontier-class multimodal. + +|Mistral Small 4 +|`mistral-small-2603` +|European small model. Hybrid reasoning+coding (March 2026). + +|Devstral 2 +|`devstral-2512` +|Code-specialized model. Tests whether SE-focused training improves anchor recognition. |Gemini 2.5 Pro +|TBD |Google, different training approach. |=== -*Open-weight models (available as open-source):* +*Open-weight models (run locally via Ollama):* -[cols="1,2,1"] +[cols="2,1,2"] |=== -|Model |Rationale |Local? +|Model |Local? |Rationale |Llama 4 Maverick -|Largest open-weight model. Shows whether anchors work without proprietary training -- relevant for self-hosted setups. |Yes (Ollama) - -|Mistral Large -|European model with a different training focus. Interesting because the anchor catalog is heavily influenced by English-language software engineering literature. -|No (too large, use La Plateforme API) +|Largest open-weight model. Shows whether anchors work without proprietary training. |DeepSeek V3 +|Yes (Ollama) |Chinese model. Tests whether anchors work across cultural and training-data boundaries. + +|Ministral 3 8B |Yes (Ollama) +|Mistral's tiny model. Lower boundary test. |=== -Llama and DeepSeek can run locally (e.g., via Ollama) at no API cost. -Mistral Large requires the Mistral API -- it is open-weight but too large for local inference. -This means 4 models have API costs (Claude, GPT, Gemini, Mistral) and 2 run locally for free. - === Effort Estimate Each question runs 4 times (randomized option order) to control for position bias. diff --git a/evaluations/.gitignore b/evaluations/.gitignore index 0d20b64..541aa26 100644 --- a/evaluations/.gitignore +++ b/evaluations/.gitignore @@ -1 +1,3 @@ *.pyc +# Full results with raw responses (large, reproducible via pilot.py) +results/ diff --git a/evaluations/generate-report.py b/evaluations/generate-report.py index a6be43e..35c9ef3 100644 --- a/evaluations/generate-report.py +++ b/evaluations/generate-report.py @@ -19,35 +19,58 @@ from collections import defaultdict from pathlib import Path -RESULTS_DIR = Path(__file__).parent / "results" +RESULTS_DIR = Path(__file__).parent / "summaries" SPECS_DIR = Path(__file__).parent / "specs" -# Models to include and display order -MODEL_DISPLAY = { +# Fallback display names (used when config doesn't have exact model ID) +MODEL_DISPLAY_FALLBACK = { "claude": "Claude Sonnet", "claude-cli": "Claude Sonnet (CLI)", "claude-haiku": "Claude Haiku", "openai": "GPT-4o", - "mistral": "Mistral Large", + "mistral": "Mistral", "ollama": "Ollama (local)", } + +def get_model_display(backend, config): + """Get exact model display name from config.""" + if backend == "openai" and config.get("openai_model"): + return config["openai_model"] + if backend == "mistral" and config.get("mistral_model"): + return config["mistral_model"] + if backend == "deepseek" and config.get("deepseek_model"): + return config["deepseek_model"] + if backend == "ollama" and config.get("ollama_model"): + return f"ollama/{config['ollama_model']}" + if backend == "claude": + return "claude-sonnet-4-20250514" + if backend == "claude-cli": + return "claude-sonnet-4 (CLI)" + if backend == "claude-haiku": + return "claude-haiku-4-5" + return MODEL_DISPLAY_FALLBACK.get(backend, backend) + CONTROL_ANCHORS = {"sanity-check", "negative-control"} def load_best_results(): - """Load the latest result with the most questions per model.""" + """Load the latest result per unique model identifier.""" results = {} for f in sorted(RESULTS_DIR.glob("pilot-*.json")): d = json.load(open(f, encoding="utf-8")) + config = d.get("config", {}) for m, r in d["models"].items(): - if m not in results or len(r) >= len(results[m]["data"]): - results[m] = { + # Use exact model ID as key instead of backend alias + exact_id = get_model_display(m, config) + if exact_id not in results or len(r) >= len(results[exact_id]["data"]): + results[exact_id] = { "data": r, "file": f.name, - "config": d.get("config", {}), + "config": config, "duration": d.get("duration_seconds", 0), "timestamp": d.get("timestamp", ""), + "backend": m, } return results @@ -71,19 +94,15 @@ def score_bg(score): def generate_html(results, output_path): + # Keys are already exact model IDs (e.g. "mistral-large-2512") + display_names = {m: m for m in results} + # Collect all anchors and questions all_questions = defaultdict(dict) # anchor/label -> {model: score} - model_names = [] - - # Prefer full runs (75 questions) over pilot runs - for m in ["claude", "openai", "mistral"]: - if m in results and len(results[m]["data"]) >= 60: - model_names.append(m) - # Add smaller runs if no full run exists - for m in ["claude-cli", "claude-haiku", "ollama"]: - if m in results and m not in model_names: - model_names.append(m) + # Sort models: most questions first, then alphabetically + model_names = sorted(results.keys(), + key=lambda m: (-len(results[m]["data"]), m)) for m in model_names: for q in results[m]["data"]: @@ -162,7 +181,7 @@ def generate_html(results, output_path): for m in model_names: avg = model_avgs.get(m, 0) - display = MODEL_DISPLAY.get(m, m) + display = display_names.get(m, m) n = len([1 for l in anchor_questions if anchor_questions[l].get(m) is not None]) info = results[m] html += f"""
@@ -181,7 +200,7 @@ def generate_html(results, output_path): """ for m in model_names: - html += f" {MODEL_DISPLAY.get(m, m)}\n" + html += f" {display_names.get(m, m)}\n" html += "\n\n" for anchor_id in sorted(anchor_groups.keys()): @@ -224,7 +243,7 @@ def generate_html(results, output_path): if control_questions: html += '

Control Questions

\n\n' for m in model_names: - html += f"" + html += f"" html += "\n\n" for label in sorted(control_questions.keys()): short = label.replace("/recognition", "") @@ -246,9 +265,9 @@ def generate_html(results, output_path): fails = [(q["label"], q["score"]) for q in results[m]["data"] if q["score"] < 1.0 and not any(q["label"].startswith(c) for c in CONTROL_ANCHORS)] if not fails: - html += f"

{MODEL_DISPLAY.get(m, m)}: no failures

\n" + html += f"

{display_names.get(m, m)}: no failures

\n" else: - html += f'

{MODEL_DISPLAY.get(m, m)}: {len(fails)} failures

\n
\n' + html += f'

{display_names.get(m, m)}: {len(fails)} failures

\n
\n' for label, score in sorted(fails): html += f'
{h(label)}{score:.0%}
\n' html += "
\n" @@ -262,7 +281,7 @@ def generate_html(results, output_path): for m in model_names: info = results[m] dur = info["duration"] - html += f"
{MODEL_DISPLAY.get(m, m)}:
{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}

" + html += f"
{display_names.get(m, m)}:
{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}

" html += """ diff --git a/evaluations/pilot.py b/evaluations/pilot.py index c921ad0..bfc640d 100644 --- a/evaluations/pilot.py +++ b/evaluations/pilot.py @@ -312,7 +312,16 @@ def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_ print() ts = datetime.now().strftime("%Y%m%d-%H%M%S") - out_file = RESULTS_DIR / f"pilot-{ts}.json" + # Include exact model IDs in filename to prevent race conditions + model_ids = [] + for m in models: + if m == "openai": model_ids.append(openai_model) + elif m == "mistral": model_ids.append(mistral_model) + elif m == "deepseek": model_ids.append(deepseek_model) + elif m == "ollama": model_ids.append(f"ollama-{ollama_model}") + else: model_ids.append(m) + model_suffix = "_".join(model_ids).replace(":", "-").replace("/", "-") + out_file = RESULTS_DIR / f"pilot-{ts}_{model_suffix}.json" all_results = { "timestamp": datetime.now(timezone.utc).isoformat(), @@ -449,6 +458,18 @@ def append_and_save(r): save_results(all_results, out_file) print(f"\nResults saved to {out_file}") + # Also save a stripped summary (scores only, no raw responses) + summary_dir = RESULTS_DIR.parent / "summaries" + summary_dir.mkdir(parents=True, exist_ok=True) + summary = json.loads(json.dumps(all_results)) # deep copy + for m_results in summary.get("models", {}).values(): + for r in m_results: + r.pop("results", None) + summary_file = summary_dir / out_file.name + with open(summary_file, "w", encoding="utf-8") as fh: + json.dump(summary, fh, indent=2, ensure_ascii=False) + print(f"Summary saved to {summary_file}") + # Summary print("\n=== SUMMARY ===") print(f"Models: {', '.join(models)}") diff --git a/evaluations/report.html b/evaluations/report.html index 0fd9fb2..584a172 100644 --- a/evaluations/report.html +++ b/evaluations/report.html @@ -37,7 +37,7 @@

Semantic Anchor Evaluation Report

-

Multiple-choice recognition test across 3 LLMs — 191 questions, 61 anchors

+

Multiple-choice recognition test across 6 LLMs — 191 questions, 61 anchors

≥80%
@@ -48,298 +48,336 @@

Semantic Anchor Evaluation Report

Model Summary

-
Claude Sonnet
+
claude-sonnet-4-20250514
99%
191 questions · pilot-20260324-174404.json
-
GPT-4o
+
devstral-2512
+
96%
+
191 questions · pilot-20260326-073241_devstral-2512.json
+
+
+
gpt-4o
98%
191 questions · pilot-20260324-192413.json
-
Mistral Large
+
mistral-large-2512
96%
191 questions · pilot-20260324-190600.json
+
+
mistral-medium-2508
+
85%
+
191 questions · pilot-20260326-070127_mistral-medium-2508.json
+
+
+
mistral-small-2603
+
74%
+
191 questions · pilot-20260326-074132_mistral-small-2603.json
+

Heatmap: Anchor × Model

Control{MODEL_DISPLAY.get(m, m)}{display_names.get(m, m)}
- - - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Anchor / QuestionClaude SonnetGPT-4oMistral Largeclaude-sonnet-4-20250514devstral-2512gpt-4omistral-large-2512mistral-medium-2508mistral-small-2603
adr-according-to-nygard92%
application-anchor
application-paraphrase
recognition75%
arc42
application-anchor
application-paraphrase
consistency-language
consistency-variant-1
consistency-variant-2
consistency-variant-3
recognition
atam
application-anchor
application-paraphrase
recognition
bdd-given-when-then83%
application-anchor
application-paraphrase50%
recognition
bem-methodology
application-anchor
application-paraphrase
recognition
bluf
application-anchor
application-paraphrase
recognition
c4-diagrams
application-anchor
application-paraphrase
recognition
chain-of-thought
application-anchor
application-paraphrase
recognition
clean-architecture
application-anchor
application-paraphrase
recognition
control-chart-shewhart92%
application-anchor
application-paraphrase75%
recognition
conventional-commits
application-anchor
application-paraphrase
recognition
cqrs
application-anchor
application-paraphrase
recognition
cynefin-framework
application-anchor
application-paraphrase
recognition
definition-of-done
application-anchor
application-paraphrase
recognition
devils-advocate
application-anchor
application-paraphrase
recognition
diataxis-framework
application-anchor
application-paraphrase
recognition
docs-as-code
application-anchor
application-paraphrase
recognition
domain-driven-design
application-anchor
application-paraphrase
recognition
ears-requirements92%83%
application-anchor
application-paraphrase75%75%
recognition75%
event-driven-architecture
application-anchor
application-paraphrase
recognition
fagan-inspection
application-anchor
application-paraphrase
recognition
feynman-technique67%67%92%
application-anchor
application-paraphrase0%0%75%
recognition
five-whys
application-anchor
application-paraphrase
recognition
fowler-patterns
application-anchor
application-paraphrase
recognition
gherkin
application-anchor
application-paraphrase
recognition
github-flow92%92%92%
application-anchor
application-paraphrase75%75%75%
recognition
gutes-deutsch-wolf-schneider
application-anchor
application-paraphrase
recognition
hexagonal-architecture
application-anchor
application-paraphrase
recognition
iec-61508-sil-levels92%83%
application-anchor50%
application-paraphrase75%
recognition
impact-mapping
application-anchor
application-paraphrase
recognition
invest
application-anchor
application-paraphrase
recognition
iso-2501083%
application-anchor75%
application-paraphrase75%
recognition
jobs-to-be-done
application-anchor
application-paraphrase
recognition
lasr92%75%
application-anchor
application-paraphrase
recognition75%25%
linddun
application-anchor
application-paraphrase
recognition
llm-evaluations
application-anchor
application-paraphrase
recognition
madr
application-anchor
application-paraphrase
recognition
mece
application-anchor
application-paraphrase
recognition
morphological-box
application-anchor
application-paraphrase
recognition
moscow92%75%
application-anchor
application-paraphrase75%25%
recognition
mutation-testing
application-anchor
application-paraphrase
recognition
nelson-rules
application-anchor
application-paraphrase
recognition
owasp-top-10
application-anchor
application-paraphrase
recognition
plain-english-strunk-white
application-anchor
application-paraphrase
recognition
prd92%67%
application-anchor
application-paraphrase
recognition75%0%
problem-space-nvc83%
application-anchor75%
application-paraphrase75%
recognition
property-based-testing83%
application-anchor75%
application-paraphrase75%
recognition
pyramid-principle
application-anchor
application-paraphrase
recognition
semantic-versioning75%
application-anchor50%
application-paraphrase75%
recognition
socratic-method
application-anchor
application-paraphrase
recognition
sota
application-anchor
application-paraphrase
recognition
spc
application-anchor
application-paraphrase
recognition
stride
application-anchor
application-paraphrase
recognition
swot
application-anchor
application-paraphrase
recognition
tdd-chicago-school92%
application-anchor
application-paraphrase
recognition75%
tdd-london-school89%
application-anchor
application-paraphrase
consistency-language
consistency-variant-1
consistency-variant-2
consistency-variant-350%
recognition75%
testing-pyramid
application-anchor
application-paraphrase
recognition
timtowtdi
application-anchor
application-paraphrase
recognition
todotxt-flavoured-markdown83%
application-anchor
application-paraphrase
recognition50%
user-story-mapping
application-anchor
application-paraphrase
recognition
wardley-mapping
application-anchor
application-paraphrase
recognition
adr-according-to-nygard92%50%83%
application-anchor50%75%
application-paraphrase25%
recognition75%75%75%
arc4246%
application-anchor75%
application-paraphrase50%
consistency-language0%
consistency-variant-125%
consistency-variant-250%
consistency-variant-350%
recognition75%
atam83%58%92%
application-anchor50%75%
application-paraphrase50%50%
recognition75%
bdd-given-when-then50%83%33%92%
application-anchor50%50%
application-paraphrase50%25%75%
recognition0%25%
bem-methodology42%
application-anchor50%
application-paraphrase25%
recognition50%
bluf75%
application-anchor75%
application-paraphrase75%
recognition75%
c4-diagrams25%
application-anchor50%
application-paraphrase0%
recognition25%
chain-of-thought58%
application-anchor50%
application-paraphrase75%
recognition50%
clean-architecture
application-anchor
application-paraphrase
recognition
control-chart-shewhart92%92%
application-anchor75%
application-paraphrase75%
recognition
conventional-commits75%
application-anchor50%
application-paraphrase75%
recognition
cqrs92%75%
application-anchor75%75%
application-paraphrase50%
recognition
cynefin-framework92%
application-anchor75%
application-paraphrase
recognition
definition-of-done83%67%
application-anchor75%
application-paraphrase75%50%
recognition75%75%
devils-advocate58%
application-anchor75%
application-paraphrase50%
recognition50%
diataxis-framework92%58%
application-anchor75%
application-paraphrase75%
recognition75%25%
docs-as-code92%83%
application-anchor
application-paraphrase75%50%
recognition
domain-driven-design67%
application-anchor
application-paraphrase50%
recognition50%
ears-requirements83%92%83%17%
application-anchor75%0%
application-paraphrase75%75%75%50%
recognition75%0%
event-driven-architecture83%75%
application-anchor75%
application-paraphrase75%75%
recognition75%75%
fagan-inspection83%58%
application-anchor75%25%
application-paraphrase75%75%
recognition75%
feynman-technique67%83%67%92%92%75%
application-anchor75%50%
application-paraphrase0%75%0%75%75%75%
recognition
five-whys92%
application-anchor75%
application-paraphrase
recognition
fowler-patterns92%75%
application-anchor75%75%
application-paraphrase
recognition50%
gherkin83%
application-anchor75%
application-paraphrase
recognition75%
github-flow92%92%92%92%42%
application-anchor50%
application-paraphrase75%75%75%75%0%
recognition75%
gutes-deutsch-wolf-schneider83%92%
application-anchor75%
application-paraphrase
recognition75%75%
hexagonal-architecture92%92%
application-anchor
application-paraphrase75%
recognition75%
iec-61508-sil-levels83%92%83%92%33%
application-anchor75%50%0%
application-paraphrase75%75%0%
recognition75%
impact-mapping50%
application-anchor50%
application-paraphrase25%
recognition75%
invest75%
application-anchor
application-paraphrase
recognition25%
iso-2501083%75%
application-anchor75%50%
application-paraphrase75%75%
recognition
jobs-to-be-done92%83%
application-anchor50%
application-paraphrase
recognition75%
lasr83%92%75%58%
application-anchor50%
application-paraphrase75%
recognition50%75%25%50%
linddun92%83%
application-anchor
application-paraphrase75%50%
recognition
llm-evaluations83%
application-anchor75%
application-paraphrase
recognition75%
madr83%75%67%
application-anchor50%
application-paraphrase75%
recognition50%25%75%
mece92%83%
application-anchor75%50%
application-paraphrase
recognition
morphological-box92%67%
application-anchor75%25%
application-paraphrase75%
recognition
moscow92%75%92%75%
application-anchor
application-paraphrase75%25%75%50%
recognition75%
mutation-testing92%58%
application-anchor75%75%
application-paraphrase25%
recognition75%
nelson-rules92%83%
application-anchor75%50%
application-paraphrase
recognition
owasp-top-1092%67%
application-anchor75%75%
application-paraphrase50%
recognition75%
plain-english-strunk-white58%
application-anchor50%
application-paraphrase75%
recognition50%
prd83%92%67%83%58%
application-anchor50%
application-paraphrase
recognition50%75%0%50%25%
problem-space-nvc83%50%
application-anchor75%25%
application-paraphrase75%50%
recognition75%
property-based-testing92%83%92%67%
application-anchor75%75%25%
application-paraphrase75%
recognition75%75%
pyramid-principle92%83%
application-anchor75%75%
application-paraphrase75%
recognition
semantic-versioning75%83%92%
application-anchor50%75%
application-paraphrase75%75%75%
recognition
socratic-method92%67%
application-anchor75%
application-paraphrase75%
recognition25%
sota83%75%
application-anchor75%50%
application-paraphrase75%75%
recognition
spc92%50%
application-anchor75%25%
application-paraphrase75%
recognition50%
stride92%67%
application-anchor50%
application-paraphrase
recognition75%50%
swot83%67%
application-anchor25%
application-paraphrase75%75%
recognition75%
tdd-chicago-school92%92%67%
application-anchor
application-paraphrase50%
recognition75%75%50%
tdd-london-school89%89%71%54%
application-anchor50%
application-paraphrase75%75%
consistency-language75%50%
consistency-variant-175%50%
consistency-variant-275%
consistency-variant-325%50%0%25%
recognition75%75%50%
testing-pyramid83%92%
application-anchor
application-paraphrase75%75%
recognition75%
timtowtdi83%83%
application-anchor75%
application-paraphrase75%
recognition75%75%
todotxt-flavoured-markdown83%83%92%75%
application-anchor75%
application-paraphrase75%
recognition50%50%75%75%
user-story-mapping92%75%
application-anchor
application-paraphrase50%
recognition75%75%
wardley-mapping42%
application-anchor50%
application-paraphrase25%
recognition50%

Control Questions

- + - - + +
ControlClaude SonnetGPT-4oMistral Large
Controlclaude-sonnet-4-20250514devstral-2512gpt-4omistral-large-2512mistral-medium-2508mistral-small-2603
negative-control100%100%75%
sanity-check0%0%0%
negative-control100%75%100%75%100%50%
sanity-check0%0%0%0%0%0%

Failures Detail

-

Claude Sonnet: 2 failures

+

claude-sonnet-4-20250514: 2 failures

feynman-technique/application-paraphrase0%
github-flow/application-paraphrase75%
-

GPT-4o: 13 failures

+

devstral-2512: 17 failures

+
+
atam/application-paraphrase50%
+
bdd-given-when-then/application-anchor50%
+
bdd-given-when-then/recognition0%
+
ears-requirements/application-anchor75%
+
ears-requirements/application-paraphrase75%
+
feynman-technique/application-anchor75%
+
feynman-technique/application-paraphrase75%
+
github-flow/application-paraphrase75%
+
iec-61508-sil-levels/application-anchor75%
+
iec-61508-sil-levels/application-paraphrase75%
+
lasr/recognition50%
+
madr/recognition50%
+
prd/recognition50%
+
property-based-testing/application-anchor75%
+
tdd-chicago-school/recognition75%
+
tdd-london-school/consistency-variant-325%
+
todotxt-flavoured-markdown/recognition50%
+
+

gpt-4o: 13 failures

control-chart-shewhart/application-paraphrase75%
ears-requirements/application-paraphrase75%
@@ -355,7 +393,7 @@

GPT-4o: 13 failures

tdd-london-school/consistency-variant-350%
tdd-london-school/recognition75%
-

Mistral Large: 17 failures

+

mistral-large-2512: 17 failures

adr-according-to-nygard/recognition75%
bdd-given-when-then/application-paraphrase50%
@@ -375,11 +413,209 @@

Mistral Large: 17 failures

semantic-versioning/application-paraphrase75%
todotxt-flavoured-markdown/recognition50%
+

mistral-medium-2508: 77 failures

+
+
adr-according-to-nygard/application-anchor50%
+
adr-according-to-nygard/application-paraphrase25%
+
adr-according-to-nygard/recognition75%
+
arc42/application-anchor75%
+
arc42/application-paraphrase50%
+
arc42/consistency-language0%
+
arc42/consistency-variant-125%
+
arc42/consistency-variant-250%
+
arc42/consistency-variant-350%
+
arc42/recognition75%
+
atam/application-anchor50%
+
atam/application-paraphrase50%
+
atam/recognition75%
+
bdd-given-when-then/application-anchor50%
+
bdd-given-when-then/application-paraphrase25%
+
bdd-given-when-then/recognition25%
+
bem-methodology/application-anchor50%
+
bem-methodology/application-paraphrase25%
+
bem-methodology/recognition50%
+
bluf/application-anchor75%
+
bluf/application-paraphrase75%
+
bluf/recognition75%
+
c4-diagrams/application-anchor50%
+
c4-diagrams/application-paraphrase0%
+
c4-diagrams/recognition25%
+
chain-of-thought/application-anchor50%
+
chain-of-thought/application-paraphrase75%
+
chain-of-thought/recognition50%
+
control-chart-shewhart/application-anchor75%
+
cqrs/application-anchor75%
+
cynefin-framework/application-anchor75%
+
definition-of-done/application-paraphrase75%
+
definition-of-done/recognition75%
+
diataxis-framework/recognition75%
+
docs-as-code/application-paraphrase75%
+
event-driven-architecture/application-paraphrase75%
+
event-driven-architecture/recognition75%
+
fagan-inspection/application-anchor75%
+
fagan-inspection/application-paraphrase75%
+
feynman-technique/application-paraphrase75%
+
fowler-patterns/application-anchor75%
+
gutes-deutsch-wolf-schneider/application-anchor75%
+
gutes-deutsch-wolf-schneider/recognition75%
+
hexagonal-architecture/application-paraphrase75%
+
iec-61508-sil-levels/recognition75%
+
jobs-to-be-done/recognition75%
+
linddun/application-paraphrase75%
+
madr/recognition25%
+
mece/application-anchor75%
+
morphological-box/application-anchor75%
+
moscow/application-paraphrase75%
+
mutation-testing/application-anchor75%
+
nelson-rules/application-anchor75%
+
owasp-top-10/application-anchor75%
+
prd/recognition50%
+
property-based-testing/recognition75%
+
pyramid-principle/application-anchor75%
+
semantic-versioning/application-anchor75%
+
semantic-versioning/application-paraphrase75%
+
socratic-method/application-anchor75%
+
sota/application-anchor75%
+
sota/application-paraphrase75%
+
spc/application-anchor75%
+
stride/recognition75%
+
swot/application-paraphrase75%
+
swot/recognition75%
+
tdd-london-school/application-paraphrase75%
+
tdd-london-school/consistency-language75%
+
tdd-london-school/consistency-variant-175%
+
tdd-london-school/consistency-variant-30%
+
tdd-london-school/recognition75%
+
testing-pyramid/application-paraphrase75%
+
testing-pyramid/recognition75%
+
timtowtdi/application-paraphrase75%
+
timtowtdi/recognition75%
+
todotxt-flavoured-markdown/recognition75%
+
user-story-mapping/recognition75%
+
+

mistral-small-2603: 115 failures

+
+
adr-according-to-nygard/application-anchor75%
+
adr-according-to-nygard/recognition75%
+
atam/application-anchor75%
+
bdd-given-when-then/application-paraphrase75%
+
conventional-commits/application-anchor50%
+
conventional-commits/application-paraphrase75%
+
cqrs/application-anchor75%
+
cqrs/application-paraphrase50%
+
definition-of-done/application-anchor75%
+
definition-of-done/application-paraphrase50%
+
definition-of-done/recognition75%
+
devils-advocate/application-anchor75%
+
devils-advocate/application-paraphrase50%
+
devils-advocate/recognition50%
+
diataxis-framework/application-anchor75%
+
diataxis-framework/application-paraphrase75%
+
diataxis-framework/recognition25%
+
docs-as-code/application-paraphrase50%
+
domain-driven-design/application-paraphrase50%
+
domain-driven-design/recognition50%
+
ears-requirements/application-anchor0%
+
ears-requirements/application-paraphrase50%
+
ears-requirements/recognition0%
+
event-driven-architecture/application-anchor75%
+
event-driven-architecture/application-paraphrase75%
+
event-driven-architecture/recognition75%
+
fagan-inspection/application-anchor25%
+
fagan-inspection/application-paraphrase75%
+
fagan-inspection/recognition75%
+
feynman-technique/application-anchor50%
+
feynman-technique/application-paraphrase75%
+
five-whys/application-anchor75%
+
fowler-patterns/application-anchor75%
+
fowler-patterns/recognition50%
+
gherkin/application-anchor75%
+
gherkin/recognition75%
+
github-flow/application-anchor50%
+
github-flow/application-paraphrase0%
+
github-flow/recognition75%
+
gutes-deutsch-wolf-schneider/recognition75%
+
hexagonal-architecture/recognition75%
+
iec-61508-sil-levels/application-anchor0%
+
iec-61508-sil-levels/application-paraphrase0%
+
impact-mapping/application-anchor50%
+
impact-mapping/application-paraphrase25%
+
impact-mapping/recognition75%
+
invest/recognition25%
+
iso-25010/application-anchor50%
+
iso-25010/application-paraphrase75%
+
jobs-to-be-done/application-anchor50%
+
lasr/application-anchor50%
+
lasr/application-paraphrase75%
+
lasr/recognition50%
+
linddun/application-paraphrase50%
+
llm-evaluations/application-anchor75%
+
llm-evaluations/recognition75%
+
madr/application-anchor50%
+
madr/application-paraphrase75%
+
madr/recognition75%
+
mece/application-anchor50%
+
morphological-box/application-anchor25%
+
morphological-box/application-paraphrase75%
+
moscow/application-paraphrase50%
+
moscow/recognition75%
+
mutation-testing/application-anchor75%
+
mutation-testing/application-paraphrase25%
+
mutation-testing/recognition75%
+
nelson-rules/application-anchor50%
+
owasp-top-10/application-anchor75%
+
owasp-top-10/application-paraphrase50%
+
owasp-top-10/recognition75%
+
plain-english-strunk-white/application-anchor50%
+
plain-english-strunk-white/application-paraphrase75%
+
plain-english-strunk-white/recognition50%
+
prd/application-anchor50%
+
prd/recognition25%
+
problem-space-nvc/application-anchor25%
+
problem-space-nvc/application-paraphrase50%
+
problem-space-nvc/recognition75%
+
property-based-testing/application-anchor25%
+
property-based-testing/recognition75%
+
pyramid-principle/application-anchor75%
+
pyramid-principle/application-paraphrase75%
+
semantic-versioning/application-paraphrase75%
+
socratic-method/application-paraphrase75%
+
socratic-method/recognition25%
+
sota/application-anchor50%
+
sota/application-paraphrase75%
+
spc/application-anchor25%
+
spc/application-paraphrase75%
+
spc/recognition50%
+
stride/application-anchor50%
+
stride/recognition50%
+
swot/application-anchor25%
+
swot/application-paraphrase75%
+
tdd-chicago-school/application-paraphrase50%
+
tdd-chicago-school/recognition50%
+
tdd-london-school/application-anchor50%
+
tdd-london-school/application-paraphrase75%
+
tdd-london-school/consistency-language50%
+
tdd-london-school/consistency-variant-150%
+
tdd-london-school/consistency-variant-275%
+
tdd-london-school/consistency-variant-325%
+
tdd-london-school/recognition50%
+
testing-pyramid/application-paraphrase75%
+
timtowtdi/application-anchor75%
+
timtowtdi/recognition75%
+
todotxt-flavoured-markdown/application-anchor75%
+
todotxt-flavoured-markdown/application-paraphrase75%
+
todotxt-flavoured-markdown/recognition75%
+
user-story-mapping/application-paraphrase50%
+
user-story-mapping/recognition75%
+
wardley-mapping/application-anchor50%
+
wardley-mapping/application-paraphrase25%
+
wardley-mapping/recognition50%
+

Run Metadata

-
Claude Sonnet:
pilot-20260324-174404.json · 81m 2s · 2026-03-24T17:44:04

GPT-4o:
pilot-20260324-192413.json · 15m 38s · 2026-03-24T19:24:13

Mistral Large:
pilot-20260324-190600.json · 16m 58s · 2026-03-24T19:06:00

+
claude-sonnet-4-20250514:
pilot-20260324-174404.json · 81m 2s · 2026-03-24T17:44:04

devstral-2512:
pilot-20260326-073241_devstral-2512.json · 15m 19s · 2026-03-26T07:32:41

gpt-4o:
pilot-20260324-192413.json · 15m 38s · 2026-03-24T19:24:13

mistral-large-2512:
pilot-20260324-190600.json · 16m 58s · 2026-03-24T19:06:00

mistral-medium-2508:
pilot-20260326-070127_mistral-medium-2508.json · 31m 12s · 2026-03-26T07:01:27

mistral-small-2603:
pilot-20260326-074132_mistral-small-2603.json · 18m 9s · 2026-03-26T07:41:32

Generated by evaluations/generate-report.py · Position bias mitigation: 4 permutations per question · Scoring: deterministic MC (no LLM judge)

diff --git a/evaluations/results/pilot-20260324-174404.json b/evaluations/results/pilot-20260324-174404.json deleted file mode 100644 index 110f33a..0000000 --- a/evaluations/results/pilot-20260324-174404.json +++ /dev/null @@ -1,10442 +0,0 @@ -{ - "timestamp": "2026-03-24T17:44:04.891380+00:00", - "config": { - "models": [ - "claude" - ], - "openai_model": null, - "mistral_model": null, - "deepseek_model": null, - "ollama_model": null, - "ollama_url": null, - "no_think": null, - "temperature": 0.0 - }, - "models": { - "claude": [ - { - "label": "adr-according-to-nygard/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "adr-according-to-nygard/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "adr-according-to-nygard/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "arc42/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-variant-1", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-variant-2", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-variant-3", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-language", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "atam/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "atam/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "atam/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bdd-given-when-then/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bdd-given-when-then/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bdd-given-when-then/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bem-methodology/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bem-methodology/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bem-methodology/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bluf/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bluf/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bluf/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "c4-diagrams/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "c4-diagrams/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "c4-diagrams/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "chain-of-thought/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "chain-of-thought/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "chain-of-thought/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "control-chart-shewhart/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "control-chart-shewhart/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "control-chart-shewhart/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cynefin-framework/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cynefin-framework/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cynefin-framework/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "devils-advocate/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "devils-advocate/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "devils-advocate/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "diataxis-framework/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "diataxis-framework/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "diataxis-framework/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "docs-as-code/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "docs-as-code/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "docs-as-code/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "domain-driven-design/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "domain-driven-design/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "domain-driven-design/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "ears-requirements/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "ears-requirements/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "ears-requirements/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "event-driven-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "event-driven-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "event-driven-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fagan-inspection/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fagan-inspection/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fagan-inspection/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "feynman-technique/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "feynman-technique/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "feynman-technique/application-paraphrase", - "score": 0.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "C", - "correct": false, - "raw_response": "C" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "D", - "correct": false, - "raw_response": "D" - } - ] - }, - { - "label": "five-whys/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "five-whys/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "five-whys/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fowler-patterns/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fowler-patterns/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fowler-patterns/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gherkin/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "gherkin/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gherkin/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "github-flow/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "github-flow/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "github-flow/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "hexagonal-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "hexagonal-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "hexagonal-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iec-61508-sil-levels/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iec-61508-sil-levels/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iec-61508-sil-levels/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "impact-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "impact-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "impact-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iso-25010/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iso-25010/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iso-25010/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "jobs-to-be-done/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "jobs-to-be-done/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "jobs-to-be-done/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "lasr/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "lasr/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "lasr/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "linddun/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "linddun/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "linddun/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "llm-evaluations/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "llm-evaluations/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "llm-evaluations/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "madr/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "madr/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "madr/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mece/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mece/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mece/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "morphological-box/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "morphological-box/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "morphological-box/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mutation-testing/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mutation-testing/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mutation-testing/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "negative-control/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - } - ] - }, - { - "label": "nelson-rules/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "nelson-rules/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "nelson-rules/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "owasp-top-10/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "owasp-top-10/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "owasp-top-10/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "prd/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "prd/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "prd/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "problem-space-nvc/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "problem-space-nvc/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "problem-space-nvc/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "property-based-testing/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "property-based-testing/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "property-based-testing/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "pyramid-principle/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "pyramid-principle/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "pyramid-principle/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sanity-check/recognition", - "score": 0.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "X", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "X", - "answer": null, - "correct": false, - "raw_response": "None of the options provided match the famous answer from" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "X", - "answer": null, - "correct": false, - "raw_response": "None of the options provided match the correct answer from" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "X", - "answer": null, - "correct": false, - "raw_response": "None of the options provided match the famous answer from" - } - ] - }, - { - "label": "semantic-versioning/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "semantic-versioning/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "semantic-versioning/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "socratic-method/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "socratic-method/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "socratic-method/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "spc/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "spc/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "spc/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "swot/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "swot/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "swot/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-chicago-school/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-chicago-school/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-chicago-school/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-1", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-2", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-3", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-language", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "testing-pyramid/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "testing-pyramid/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "testing-pyramid/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "timtowtdi/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "timtowtdi/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "timtowtdi/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "user-story-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "user-story-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "user-story-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "wardley-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "wardley-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "wardley-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - } - ] - }, - "duration_seconds": 4862.1 -} \ No newline at end of file diff --git a/evaluations/results/pilot-20260324-190600.json b/evaluations/results/pilot-20260324-190600.json deleted file mode 100644 index 613af48..0000000 --- a/evaluations/results/pilot-20260324-190600.json +++ /dev/null @@ -1,10442 +0,0 @@ -{ - "timestamp": "2026-03-24T19:06:00.394684+00:00", - "config": { - "models": [ - "mistral" - ], - "openai_model": null, - "mistral_model": "mistral-large-latest", - "deepseek_model": null, - "ollama_model": null, - "ollama_url": null, - "no_think": null, - "temperature": 0.0 - }, - "models": { - "mistral": [ - { - "label": "adr-according-to-nygard/recognition", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "adr-according-to-nygard/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "adr-according-to-nygard/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "arc42/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-variant-1", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-variant-2", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-variant-3", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-language", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "atam/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "atam/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "atam/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bdd-given-when-then/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bdd-given-when-then/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bdd-given-when-then/application-paraphrase", - "score": 0.5, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": null, - "correct": false, - "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": null, - "correct": false, - "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bem-methodology/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bem-methodology/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bem-methodology/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bluf/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bluf/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bluf/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "c4-diagrams/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "c4-diagrams/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "c4-diagrams/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "chain-of-thought/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "chain-of-thought/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "chain-of-thought/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "control-chart-shewhart/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "control-chart-shewhart/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "control-chart-shewhart/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cynefin-framework/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cynefin-framework/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cynefin-framework/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "devils-advocate/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "devils-advocate/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "devils-advocate/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "diataxis-framework/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "diataxis-framework/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "diataxis-framework/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "docs-as-code/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "docs-as-code/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "docs-as-code/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "domain-driven-design/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "domain-driven-design/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "domain-driven-design/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "ears-requirements/recognition", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "C", - "correct": false, - "raw_response": "C)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "ears-requirements/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "ears-requirements/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "event-driven-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "event-driven-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "event-driven-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fagan-inspection/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fagan-inspection/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fagan-inspection/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "feynman-technique/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "feynman-technique/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "feynman-technique/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "five-whys/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "five-whys/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "five-whys/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fowler-patterns/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fowler-patterns/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fowler-patterns/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gherkin/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "gherkin/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gherkin/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "github-flow/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "github-flow/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "github-flow/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "hexagonal-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "hexagonal-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "hexagonal-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iec-61508-sil-levels/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iec-61508-sil-levels/application-anchor", - "score": 0.5, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "D", - "correct": false, - "raw_response": "D" - } - ] - }, - { - "label": "iec-61508-sil-levels/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "impact-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "impact-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "impact-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iso-25010/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iso-25010/application-anchor", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iso-25010/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "jobs-to-be-done/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "jobs-to-be-done/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "jobs-to-be-done/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "lasr/recognition", - "score": 0.25, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "C", - "correct": false, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "A", - "correct": false, - "raw_response": "A" - } - ] - }, - { - "label": "lasr/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "lasr/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "linddun/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "linddun/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "linddun/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "llm-evaluations/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "llm-evaluations/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "llm-evaluations/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "madr/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "madr/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "madr/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mece/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mece/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mece/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "morphological-box/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "morphological-box/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "morphological-box/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/application-paraphrase", - "score": 0.25, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "D", - "correct": false, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "B", - "correct": false, - "raw_response": "B" - } - ] - }, - { - "label": "mutation-testing/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mutation-testing/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mutation-testing/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "negative-control/recognition", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "D", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - } - ] - }, - { - "label": "nelson-rules/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "nelson-rules/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "nelson-rules/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "owasp-top-10/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "owasp-top-10/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "owasp-top-10/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "prd/recognition", - "score": 0.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "D", - "correct": false, - "raw_response": "D" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "C", - "correct": false, - "raw_response": "C" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "A", - "correct": false, - "raw_response": "A" - } - ] - }, - { - "label": "prd/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "prd/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "problem-space-nvc/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "problem-space-nvc/application-anchor", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "C", - "correct": false, - "raw_response": "C" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "problem-space-nvc/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "C", - "correct": false, - "raw_response": "C" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "property-based-testing/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "property-based-testing/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "property-based-testing/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "pyramid-principle/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "pyramid-principle/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "pyramid-principle/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sanity-check/recognition", - "score": 0.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "X", - "answer": null, - "correct": false, - "raw_response": "None of the above options is correct, but the" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "X", - "answer": null, - "correct": false, - "raw_response": "None of the options provided is correct, but the" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "X", - "answer": null, - "correct": false, - "raw_response": "None of the options provided is correct, but the" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "X", - "answer": "C", - "correct": false, - "raw_response": "C" - } - ] - }, - { - "label": "semantic-versioning/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "semantic-versioning/application-anchor", - "score": 0.5, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "C", - "correct": false, - "raw_response": "C" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "D", - "correct": false, - "raw_response": "D" - } - ] - }, - { - "label": "semantic-versioning/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "D", - "correct": false, - "raw_response": "D" - } - ] - }, - { - "label": "socratic-method/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "socratic-method/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "socratic-method/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "spc/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "spc/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "spc/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "None of the options perfectly describe the **STRIDE" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "swot/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "swot/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) SWOT Analysis\nB) Value Chain" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "swot/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-chicago-school/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-chicago-school/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-chicago-school/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-1", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-2", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-3", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-language", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "testing-pyramid/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "testing-pyramid/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "testing-pyramid/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "timtowtdi/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "timtowtdi/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "timtowtdi/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/recognition", - "score": 0.5, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "D", - "correct": false, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "user-story-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "user-story-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "user-story-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "wardley-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "wardley-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "wardley-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - } - ] - }, - "duration_seconds": 1018.4 -} \ No newline at end of file diff --git a/evaluations/results/pilot-20260324-192413.json b/evaluations/results/pilot-20260324-192413.json deleted file mode 100644 index 8c5f85e..0000000 --- a/evaluations/results/pilot-20260324-192413.json +++ /dev/null @@ -1,10442 +0,0 @@ -{ - "timestamp": "2026-03-24T19:24:13.551875+00:00", - "config": { - "models": [ - "openai" - ], - "openai_model": "gpt-4o", - "mistral_model": null, - "deepseek_model": null, - "ollama_model": null, - "ollama_url": null, - "no_think": null, - "temperature": 0.0 - }, - "models": { - "openai": [ - { - "label": "adr-according-to-nygard/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "adr-according-to-nygard/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "adr-according-to-nygard/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "arc42/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) A 12-section template for standardized software" - } - ] - }, - { - "label": "arc42/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) External interfaces in Section 3 (Context" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "arc42/consistency-variant-1", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Gernot Starke" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Gernot Starke" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Gernot Starke" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Gernot Starke" - } - ] - }, - { - "label": "arc42/consistency-variant-2", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Gernot Starke" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Gernot Starke" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Gernot Starke" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Gernot Starke" - } - ] - }, - { - "label": "arc42/consistency-variant-3", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Gernot Starke" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Gernot Starke" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Gernot Starke" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Gernot Starke" - } - ] - }, - { - "label": "arc42/consistency-language", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "atam/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "atam/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "atam/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Build a utility tree to prioritize quality scenarios" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Build a utility tree to prioritize quality scenarios" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Build a utility tree to prioritize quality scenarios" - } - ] - }, - { - "label": "bdd-given-when-then/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bdd-given-when-then/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bdd-given-when-then/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Organize discovery workshops with the three amigos" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bem-methodology/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bem-methodology/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Use structured class names like `.navbar`," - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Use structured class names like `.navbar`," - } - ] - }, - { - "label": "bem-methodology/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Use structured class names like `.navbar`," - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Use structured class names like `.navbar`," - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Use structured class names like `.navbar`," - } - ] - }, - { - "label": "bluf/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bluf/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "bluf/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "c4-diagrams/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Four levels of abstraction; : system in" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Four levels of abstraction; : system in" - } - ] - }, - { - "label": "c4-diagrams/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "c4-diagrams/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "chain-of-thought/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "chain-of-thought/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "chain-of-thought/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Define payment processing use cases in the core" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "clean-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "control-chart-shewhart/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Measured value plotted over time; process" - } - ] - }, - { - "label": "control-chart-shewhart/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "control-chart-shewhart/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "conventional-commits/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cqrs/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Create separate optimized data models: a normalized" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Create separate optimized data models: a normalized" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Create separate optimized data models: a normalized" - } - ] - }, - { - "label": "cynefin-framework/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Five domains; : best practices apply," - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Five domains; : best practices apply," - } - ] - }, - { - "label": "cynefin-framework/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "cynefin-framework/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "definition-of-done/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Collaboratively create a single, team-wide" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "devils-advocate/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Present opposing viewpoints even if not personally held" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "devils-advocate/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "devils-advocate/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Systematically argue against your own design by" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "diataxis-framework/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Four documentation types; : learning-oriented," - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "diataxis-framework/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Develop four distinct documentation sections: beginner tutorials" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "diataxis-framework/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Develop four distinct documentation sections: beginner tutorials" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Develop four distinct documentation sections: beginner tutorials" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "docs-as-code/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "docs-as-code/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "docs-as-code/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "domain-driven-design/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "domain-driven-design/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "domain-driven-design/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Establish a ubiquitous language by working closely with" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "ears-requirements/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "ears-requirements/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Structure requirements using specific templates: 'The" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Structure requirements using specific templates: 'The" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "ears-requirements/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Structure requirements using specific templates: 'The" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B) Create a comprehensive requirements specification document with functional" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Structure requirements using specific templates: 'The" - } - ] - }, - { - "label": "event-driven-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "event-driven-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "event-driven-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "fagan-inspection/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fagan-inspection/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fagan-inspection/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "feynman-technique/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "feynman-technique/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "feynman-technique/application-paraphrase", - "score": 0.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "C", - "correct": false, - "raw_response": "C)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "A", - "correct": false, - "raw_response": "A" - } - ] - }, - { - "label": "five-whys/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "five-whys/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "five-whys/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fowler-patterns/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "fowler-patterns/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "fowler-patterns/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Use a Domain Model pattern for complex business" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Use a Domain Model pattern for complex business" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Use a Domain Model pattern for complex business" - } - ] - }, - { - "label": "gherkin/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "gherkin/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gherkin/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "github-flow/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Workflow steps" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Workflow steps" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Workflow steps" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "github-flow/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "github-flow/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "gutes-deutsch-wolf-schneider/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "hexagonal-architecture/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "hexagonal-architecture/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "hexagonal-architecture/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iec-61508-sil-levels/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iec-61508-sil-levels/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iec-61508-sil-levels/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "D", - "correct": false, - "raw_response": "D" - } - ] - }, - { - "label": "impact-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Goal → Actors → Impacts → Deliver" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "impact-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Map the retention goal to key actors (" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Map the retention goal to key actors (" - } - ] - }, - { - "label": "impact-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Map the retention goal to key actors (" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "invest/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Split this into multiple smaller stories with specific" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Split this into multiple smaller stories with specific" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "iso-25010/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "iso-25010/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "iso-25010/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "jobs-to-be-done/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "jobs-to-be-done/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Interview users about the specific circumstances that led" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "jobs-to-be-done/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Interview users about the specific circumstances that led" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Interview users about the specific circumstances that led" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Interview users about the specific circumstances that led" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Interview users about the specific circumstances that led" - } - ] - }, - { - "label": "lasr/recognition", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "A", - "correct": false, - "raw_response": "A" - } - ] - }, - { - "label": "lasr/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "lasr/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "linddun/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "linddun/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "linddun/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Systematically analyze the system against seven privacy" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Systematically analyze the system against seven privacy" - } - ] - }, - { - "label": "llm-evaluations/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "llm-evaluations/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "llm-evaluations/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "madr/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "madr/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Document the decision with sections for context," - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Document the decision with sections for context," - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "madr/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Document the decision with sections for context," - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "mece/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Structuring categories so they do not overlap" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mece/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Organize by business capability: User Service" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Organize by business capability: User Service" - } - ] - }, - { - "label": "mece/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Organize by business capability: User Service" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Organize by business capability: User Service" - } - ] - }, - { - "label": "morphological-box/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Break complex problem into independent parameters/dim" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Break complex problem into independent parameters/dim" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Break complex problem into independent parameters/dim" - } - ] - }, - { - "label": "morphological-box/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "morphological-box/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "moscow/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "C", - "correct": false, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mutation-testing/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mutation-testing/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "mutation-testing/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "negative-control/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - } - ] - }, - { - "label": "nelson-rules/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "nelson-rules/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "nelson-rules/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "owasp-top-10/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "owasp-top-10/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "owasp-top-10/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "plain-english-strunk-white/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "prd/recognition", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "A", - "correct": false, - "raw_response": "A" - } - ] - }, - { - "label": "prd/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "prd/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Write a comprehensive document that defines the problem" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Write a comprehensive document that defines the problem" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "problem-space-nvc/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Concrete, objective facts without evaluation or judgment" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "problem-space-nvc/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "problem-space-nvc/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "property-based-testing/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Invariants that should always hold; automatic" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "property-based-testing/application-anchor", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Define mathematical invariants like 'interest calculations" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "A", - "correct": false, - "raw_response": "A" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "property-based-testing/application-paraphrase", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "D", - "correct": false, - "raw_response": "D) Write comprehensive unit tests covering typical financial scenarios" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Define mathematical invariants like 'interest calculations" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "pyramid-principle/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "pyramid-principle/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "pyramid-principle/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sanity-check/recognition", - "score": 0.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "X", - "answer": "A", - "correct": false, - "raw_response": "None of the options provided are correct. The Answer" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "X", - "answer": "A", - "correct": false, - "raw_response": "None of the options provided correspond to the Answer to" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "X", - "answer": "A", - "correct": false, - "raw_response": "None of the options provided correspond to the Answer to" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "X", - "answer": "A", - "correct": false, - "raw_response": "None of the options provided correspond to the Answer to" - } - ] - }, - { - "label": "semantic-versioning/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "semantic-versioning/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) 3.0.0 - because" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "semantic-versioning/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "socratic-method/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "socratic-method/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "socratic-method/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "sota/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "spc/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "spc/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "spc/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "None of the options provided accurately describe the STRIDE" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "stride/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "swot/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "swot/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Analyze internal strengths and weaknesses of your current" - } - ] - }, - { - "label": "swot/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Analyze internal strengths and weaknesses of your current" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "tdd-chicago-school/recognition", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "D", - "correct": false, - "raw_response": "D)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-chicago-school/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "tdd-chicago-school/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Begin with tests for the core pricing calculations" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "tdd-london-school/recognition", - "score": 0.75, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "D", - "correct": false, - "raw_response": "D" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Write a test that mocks PaymentGateway and" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-1", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Steve Freeman" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Steve Freeman" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Steve Freeman" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Steve Freeman" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-2", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Steve Freeman" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Steve Freeman" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Steve Freeman" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Steve Freeman" - } - ] - }, - { - "label": "tdd-london-school/consistency-variant-3", - "score": 0.5, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "C", - "correct": false, - "raw_response": "C) Dan North" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "B", - "correct": false, - "raw_response": "B)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Steve Freeman" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C) Steve Freeman" - } - ] - }, - { - "label": "tdd-london-school/consistency-language", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D) Steve Freeman" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "testing-pyramid/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Three layers; more unit tests, fewer" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A) Three layers; more unit tests, fewer" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "testing-pyramid/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "testing-pyramid/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "timtowtdi/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - } - ] - }, - { - "label": "timtowtdi/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "timtowtdi/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "todotxt-flavoured-markdown/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "user-story-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "user-story-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "user-story-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A)" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "wardley-mapping/recognition", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C)" - } - ] - }, - { - "label": "wardley-mapping/application-anchor", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B)" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - }, - { - "label": "wardley-mapping/application-paraphrase", - "score": 1.0, - "results": [ - { - "permutation": [ - "A", - "B", - "C", - "D" - ], - "expected": "B", - "answer": "B", - "correct": true, - "raw_response": "B) Map the payment processing component's position on" - }, - { - "permutation": [ - "B", - "C", - "D", - "A" - ], - "expected": "A", - "answer": "A", - "correct": true, - "raw_response": "A" - }, - { - "permutation": [ - "C", - "D", - "A", - "B" - ], - "expected": "D", - "answer": "D", - "correct": true, - "raw_response": "D)" - }, - { - "permutation": [ - "D", - "A", - "B", - "C" - ], - "expected": "C", - "answer": "C", - "correct": true, - "raw_response": "C" - } - ] - } - ] - }, - "duration_seconds": 938.4 -} \ No newline at end of file diff --git a/evaluations/summaries/pilot-20260324-174404.json b/evaluations/summaries/pilot-20260324-174404.json new file mode 100644 index 0000000..c66b5d9 --- /dev/null +++ b/evaluations/summaries/pilot-20260324-174404.json @@ -0,0 +1,792 @@ +{ + "timestamp": "2026-03-24T17:44:04.891380+00:00", + "config": { + "models": [ + "claude" + ], + "openai_model": null, + "mistral_model": null, + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "claude": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 1.0 + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 1.0 + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/recognition", + "score": 1.0 + }, + { + "label": "arc42/application-anchor", + "score": 1.0 + }, + { + "label": "arc42/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-1", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-2", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-3", + "score": 1.0 + }, + { + "label": "arc42/consistency-language", + "score": 1.0 + }, + { + "label": "atam/recognition", + "score": 1.0 + }, + { + "label": "atam/application-anchor", + "score": 1.0 + }, + { + "label": "atam/application-paraphrase", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/recognition", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 1.0 + }, + { + "label": "bem-methodology/recognition", + "score": 1.0 + }, + { + "label": "bem-methodology/application-anchor", + "score": 1.0 + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 1.0 + }, + { + "label": "bluf/recognition", + "score": 1.0 + }, + { + "label": "bluf/application-anchor", + "score": 1.0 + }, + { + "label": "bluf/application-paraphrase", + "score": 1.0 + }, + { + "label": "c4-diagrams/recognition", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-anchor", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 1.0 + }, + { + "label": "chain-of-thought/recognition", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-anchor", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 1.0 + }, + { + "label": "clean-architecture/recognition", + "score": 1.0 + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/recognition", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 1.0 + }, + { + "label": "conventional-commits/recognition", + "score": 1.0 + }, + { + "label": "conventional-commits/application-anchor", + "score": 1.0 + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 1.0 + }, + { + "label": "cqrs/recognition", + "score": 1.0 + }, + { + "label": "cqrs/application-anchor", + "score": 1.0 + }, + { + "label": "cqrs/application-paraphrase", + "score": 1.0 + }, + { + "label": "cynefin-framework/recognition", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-anchor", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "definition-of-done/recognition", + "score": 1.0 + }, + { + "label": "definition-of-done/application-anchor", + "score": 1.0 + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "devils-advocate/recognition", + "score": 1.0 + }, + { + "label": "devils-advocate/application-anchor", + "score": 1.0 + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 1.0 + }, + { + "label": "diataxis-framework/recognition", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-anchor", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "docs-as-code/recognition", + "score": 1.0 + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0 + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 1.0 + }, + { + "label": "domain-driven-design/recognition", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 1.0 + }, + { + "label": "ears-requirements/recognition", + "score": 1.0 + }, + { + "label": "ears-requirements/application-anchor", + "score": 1.0 + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 1.0 + }, + { + "label": "event-driven-architecture/recognition", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "fagan-inspection/recognition", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-anchor", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 1.0 + }, + { + "label": "feynman-technique/recognition", + "score": 1.0 + }, + { + "label": "feynman-technique/application-anchor", + "score": 1.0 + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.0 + }, + { + "label": "five-whys/recognition", + "score": 1.0 + }, + { + "label": "five-whys/application-anchor", + "score": 1.0 + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0 + }, + { + "label": "fowler-patterns/recognition", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-anchor", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0 + }, + { + "label": "gherkin/recognition", + "score": 1.0 + }, + { + "label": "gherkin/application-anchor", + "score": 1.0 + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0 + }, + { + "label": "github-flow/recognition", + "score": 1.0 + }, + { + "label": "github-flow/application-anchor", + "score": 1.0 + }, + { + "label": "github-flow/application-paraphrase", + "score": 0.75 + }, + { + "label": "gutes-deutsch-wolf-schneider/recognition", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/recognition", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 1.0 + }, + { + "label": "impact-mapping/recognition", + "score": 1.0 + }, + { + "label": "impact-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "invest/recognition", + "score": 1.0 + }, + { + "label": "invest/application-anchor", + "score": 1.0 + }, + { + "label": "invest/application-paraphrase", + "score": 1.0 + }, + { + "label": "iso-25010/recognition", + "score": 1.0 + }, + { + "label": "iso-25010/application-anchor", + "score": 1.0 + }, + { + "label": "iso-25010/application-paraphrase", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/recognition", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "lasr/recognition", + "score": 1.0 + }, + { + "label": "lasr/application-anchor", + "score": 1.0 + }, + { + "label": "lasr/application-paraphrase", + "score": 1.0 + }, + { + "label": "linddun/recognition", + "score": 1.0 + }, + { + "label": "linddun/application-anchor", + "score": 1.0 + }, + { + "label": "linddun/application-paraphrase", + "score": 1.0 + }, + { + "label": "llm-evaluations/recognition", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-anchor", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0 + }, + { + "label": "madr/recognition", + "score": 1.0 + }, + { + "label": "madr/application-anchor", + "score": 1.0 + }, + { + "label": "madr/application-paraphrase", + "score": 1.0 + }, + { + "label": "mece/recognition", + "score": 1.0 + }, + { + "label": "mece/application-anchor", + "score": 1.0 + }, + { + "label": "mece/application-paraphrase", + "score": 1.0 + }, + { + "label": "morphological-box/recognition", + "score": 1.0 + }, + { + "label": "morphological-box/application-anchor", + "score": 1.0 + }, + { + "label": "morphological-box/application-paraphrase", + "score": 1.0 + }, + { + "label": "moscow/recognition", + "score": 1.0 + }, + { + "label": "moscow/application-anchor", + "score": 1.0 + }, + { + "label": "moscow/application-paraphrase", + "score": 1.0 + }, + { + "label": "mutation-testing/recognition", + "score": 1.0 + }, + { + "label": "mutation-testing/application-anchor", + "score": 1.0 + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "negative-control/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/application-anchor", + "score": 1.0 + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0 + }, + { + "label": "owasp-top-10/recognition", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-anchor", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 1.0 + }, + { + "label": "prd/recognition", + "score": 1.0 + }, + { + "label": "prd/application-anchor", + "score": 1.0 + }, + { + "label": "prd/application-paraphrase", + "score": 1.0 + }, + { + "label": "problem-space-nvc/recognition", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 1.0 + }, + { + "label": "property-based-testing/recognition", + "score": 1.0 + }, + { + "label": "property-based-testing/application-anchor", + "score": 1.0 + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-anchor", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 1.0 + }, + { + "label": "sanity-check/recognition", + "score": 0.0 + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-anchor", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-paraphrase", + "score": 1.0 + }, + { + "label": "socratic-method/recognition", + "score": 1.0 + }, + { + "label": "socratic-method/application-anchor", + "score": 1.0 + }, + { + "label": "socratic-method/application-paraphrase", + "score": 1.0 + }, + { + "label": "sota/recognition", + "score": 1.0 + }, + { + "label": "sota/application-anchor", + "score": 1.0 + }, + { + "label": "sota/application-paraphrase", + "score": 1.0 + }, + { + "label": "spc/recognition", + "score": 1.0 + }, + { + "label": "spc/application-anchor", + "score": 1.0 + }, + { + "label": "spc/application-paraphrase", + "score": 1.0 + }, + { + "label": "stride/recognition", + "score": 1.0 + }, + { + "label": "stride/application-anchor", + "score": 1.0 + }, + { + "label": "stride/application-paraphrase", + "score": 1.0 + }, + { + "label": "swot/recognition", + "score": 1.0 + }, + { + "label": "swot/application-anchor", + "score": 1.0 + }, + { + "label": "swot/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/recognition", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/recognition", + "score": 1.0 + }, + { + "label": "tdd-london-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-1", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-language", + "score": 1.0 + }, + { + "label": "testing-pyramid/recognition", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 1.0 + }, + { + "label": "timtowtdi/recognition", + "score": 1.0 + }, + { + "label": "timtowtdi/application-anchor", + "score": 1.0 + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 1.0 + }, + { + "label": "user-story-mapping/recognition", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "wardley-mapping/recognition", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 1.0 + } + ] + }, + "duration_seconds": 4862.1 +} \ No newline at end of file diff --git a/evaluations/summaries/pilot-20260324-190600.json b/evaluations/summaries/pilot-20260324-190600.json new file mode 100644 index 0000000..075b200 --- /dev/null +++ b/evaluations/summaries/pilot-20260324-190600.json @@ -0,0 +1,792 @@ +{ + "timestamp": "2026-03-24T19:06:00.394684+00:00", + "config": { + "models": [ + "mistral" + ], + "openai_model": null, + "mistral_model": "mistral-large-2512", + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "mistral": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 0.75 + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 1.0 + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/recognition", + "score": 1.0 + }, + { + "label": "arc42/application-anchor", + "score": 1.0 + }, + { + "label": "arc42/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-1", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-2", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-3", + "score": 1.0 + }, + { + "label": "arc42/consistency-language", + "score": 1.0 + }, + { + "label": "atam/recognition", + "score": 1.0 + }, + { + "label": "atam/application-anchor", + "score": 1.0 + }, + { + "label": "atam/application-paraphrase", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/recognition", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 0.5 + }, + { + "label": "bem-methodology/recognition", + "score": 1.0 + }, + { + "label": "bem-methodology/application-anchor", + "score": 1.0 + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 1.0 + }, + { + "label": "bluf/recognition", + "score": 1.0 + }, + { + "label": "bluf/application-anchor", + "score": 1.0 + }, + { + "label": "bluf/application-paraphrase", + "score": 1.0 + }, + { + "label": "c4-diagrams/recognition", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-anchor", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 1.0 + }, + { + "label": "chain-of-thought/recognition", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-anchor", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 1.0 + }, + { + "label": "clean-architecture/recognition", + "score": 1.0 + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/recognition", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 1.0 + }, + { + "label": "conventional-commits/recognition", + "score": 1.0 + }, + { + "label": "conventional-commits/application-anchor", + "score": 1.0 + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 1.0 + }, + { + "label": "cqrs/recognition", + "score": 1.0 + }, + { + "label": "cqrs/application-anchor", + "score": 1.0 + }, + { + "label": "cqrs/application-paraphrase", + "score": 1.0 + }, + { + "label": "cynefin-framework/recognition", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-anchor", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "definition-of-done/recognition", + "score": 1.0 + }, + { + "label": "definition-of-done/application-anchor", + "score": 1.0 + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "devils-advocate/recognition", + "score": 1.0 + }, + { + "label": "devils-advocate/application-anchor", + "score": 1.0 + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 1.0 + }, + { + "label": "diataxis-framework/recognition", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-anchor", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "docs-as-code/recognition", + "score": 1.0 + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0 + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 1.0 + }, + { + "label": "domain-driven-design/recognition", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 1.0 + }, + { + "label": "ears-requirements/recognition", + "score": 0.75 + }, + { + "label": "ears-requirements/application-anchor", + "score": 1.0 + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 0.75 + }, + { + "label": "event-driven-architecture/recognition", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "fagan-inspection/recognition", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-anchor", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 1.0 + }, + { + "label": "feynman-technique/recognition", + "score": 1.0 + }, + { + "label": "feynman-technique/application-anchor", + "score": 1.0 + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.75 + }, + { + "label": "five-whys/recognition", + "score": 1.0 + }, + { + "label": "five-whys/application-anchor", + "score": 1.0 + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0 + }, + { + "label": "fowler-patterns/recognition", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-anchor", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0 + }, + { + "label": "gherkin/recognition", + "score": 1.0 + }, + { + "label": "gherkin/application-anchor", + "score": 1.0 + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0 + }, + { + "label": "github-flow/recognition", + "score": 1.0 + }, + { + "label": "github-flow/application-anchor", + "score": 1.0 + }, + { + "label": "github-flow/application-paraphrase", + "score": 0.75 + }, + { + "label": "gutes-deutsch-wolf-schneider/recognition", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/recognition", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 0.5 + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 1.0 + }, + { + "label": "impact-mapping/recognition", + "score": 1.0 + }, + { + "label": "impact-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "invest/recognition", + "score": 1.0 + }, + { + "label": "invest/application-anchor", + "score": 1.0 + }, + { + "label": "invest/application-paraphrase", + "score": 1.0 + }, + { + "label": "iso-25010/recognition", + "score": 1.0 + }, + { + "label": "iso-25010/application-anchor", + "score": 0.75 + }, + { + "label": "iso-25010/application-paraphrase", + "score": 0.75 + }, + { + "label": "jobs-to-be-done/recognition", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "lasr/recognition", + "score": 0.25 + }, + { + "label": "lasr/application-anchor", + "score": 1.0 + }, + { + "label": "lasr/application-paraphrase", + "score": 1.0 + }, + { + "label": "linddun/recognition", + "score": 1.0 + }, + { + "label": "linddun/application-anchor", + "score": 1.0 + }, + { + "label": "linddun/application-paraphrase", + "score": 1.0 + }, + { + "label": "llm-evaluations/recognition", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-anchor", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0 + }, + { + "label": "madr/recognition", + "score": 1.0 + }, + { + "label": "madr/application-anchor", + "score": 1.0 + }, + { + "label": "madr/application-paraphrase", + "score": 1.0 + }, + { + "label": "mece/recognition", + "score": 1.0 + }, + { + "label": "mece/application-anchor", + "score": 1.0 + }, + { + "label": "mece/application-paraphrase", + "score": 1.0 + }, + { + "label": "morphological-box/recognition", + "score": 1.0 + }, + { + "label": "morphological-box/application-anchor", + "score": 1.0 + }, + { + "label": "morphological-box/application-paraphrase", + "score": 1.0 + }, + { + "label": "moscow/recognition", + "score": 1.0 + }, + { + "label": "moscow/application-anchor", + "score": 1.0 + }, + { + "label": "moscow/application-paraphrase", + "score": 0.25 + }, + { + "label": "mutation-testing/recognition", + "score": 1.0 + }, + { + "label": "mutation-testing/application-anchor", + "score": 1.0 + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "negative-control/recognition", + "score": 0.75 + }, + { + "label": "nelson-rules/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/application-anchor", + "score": 1.0 + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0 + }, + { + "label": "owasp-top-10/recognition", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-anchor", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 1.0 + }, + { + "label": "prd/recognition", + "score": 0.0 + }, + { + "label": "prd/application-anchor", + "score": 1.0 + }, + { + "label": "prd/application-paraphrase", + "score": 1.0 + }, + { + "label": "problem-space-nvc/recognition", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 0.75 + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 0.75 + }, + { + "label": "property-based-testing/recognition", + "score": 1.0 + }, + { + "label": "property-based-testing/application-anchor", + "score": 1.0 + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-anchor", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 1.0 + }, + { + "label": "sanity-check/recognition", + "score": 0.0 + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-anchor", + "score": 0.5 + }, + { + "label": "semantic-versioning/application-paraphrase", + "score": 0.75 + }, + { + "label": "socratic-method/recognition", + "score": 1.0 + }, + { + "label": "socratic-method/application-anchor", + "score": 1.0 + }, + { + "label": "socratic-method/application-paraphrase", + "score": 1.0 + }, + { + "label": "sota/recognition", + "score": 1.0 + }, + { + "label": "sota/application-anchor", + "score": 1.0 + }, + { + "label": "sota/application-paraphrase", + "score": 1.0 + }, + { + "label": "spc/recognition", + "score": 1.0 + }, + { + "label": "spc/application-anchor", + "score": 1.0 + }, + { + "label": "spc/application-paraphrase", + "score": 1.0 + }, + { + "label": "stride/recognition", + "score": 1.0 + }, + { + "label": "stride/application-anchor", + "score": 1.0 + }, + { + "label": "stride/application-paraphrase", + "score": 1.0 + }, + { + "label": "swot/recognition", + "score": 1.0 + }, + { + "label": "swot/application-anchor", + "score": 1.0 + }, + { + "label": "swot/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/recognition", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/recognition", + "score": 1.0 + }, + { + "label": "tdd-london-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-1", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-language", + "score": 1.0 + }, + { + "label": "testing-pyramid/recognition", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 1.0 + }, + { + "label": "timtowtdi/recognition", + "score": 1.0 + }, + { + "label": "timtowtdi/application-anchor", + "score": 1.0 + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 0.5 + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 1.0 + }, + { + "label": "user-story-mapping/recognition", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "wardley-mapping/recognition", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 1.0 + } + ] + }, + "duration_seconds": 1018.4 +} \ No newline at end of file diff --git a/evaluations/summaries/pilot-20260324-192413.json b/evaluations/summaries/pilot-20260324-192413.json new file mode 100644 index 0000000..5312994 --- /dev/null +++ b/evaluations/summaries/pilot-20260324-192413.json @@ -0,0 +1,792 @@ +{ + "timestamp": "2026-03-24T19:24:13.551875+00:00", + "config": { + "models": [ + "openai" + ], + "openai_model": "gpt-4o", + "mistral_model": null, + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "openai": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 1.0 + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 1.0 + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/recognition", + "score": 1.0 + }, + { + "label": "arc42/application-anchor", + "score": 1.0 + }, + { + "label": "arc42/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-1", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-2", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-3", + "score": 1.0 + }, + { + "label": "arc42/consistency-language", + "score": 1.0 + }, + { + "label": "atam/recognition", + "score": 1.0 + }, + { + "label": "atam/application-anchor", + "score": 1.0 + }, + { + "label": "atam/application-paraphrase", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/recognition", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 1.0 + }, + { + "label": "bem-methodology/recognition", + "score": 1.0 + }, + { + "label": "bem-methodology/application-anchor", + "score": 1.0 + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 1.0 + }, + { + "label": "bluf/recognition", + "score": 1.0 + }, + { + "label": "bluf/application-anchor", + "score": 1.0 + }, + { + "label": "bluf/application-paraphrase", + "score": 1.0 + }, + { + "label": "c4-diagrams/recognition", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-anchor", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 1.0 + }, + { + "label": "chain-of-thought/recognition", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-anchor", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 1.0 + }, + { + "label": "clean-architecture/recognition", + "score": 1.0 + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/recognition", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 0.75 + }, + { + "label": "conventional-commits/recognition", + "score": 1.0 + }, + { + "label": "conventional-commits/application-anchor", + "score": 1.0 + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 1.0 + }, + { + "label": "cqrs/recognition", + "score": 1.0 + }, + { + "label": "cqrs/application-anchor", + "score": 1.0 + }, + { + "label": "cqrs/application-paraphrase", + "score": 1.0 + }, + { + "label": "cynefin-framework/recognition", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-anchor", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "definition-of-done/recognition", + "score": 1.0 + }, + { + "label": "definition-of-done/application-anchor", + "score": 1.0 + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "devils-advocate/recognition", + "score": 1.0 + }, + { + "label": "devils-advocate/application-anchor", + "score": 1.0 + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 1.0 + }, + { + "label": "diataxis-framework/recognition", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-anchor", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "docs-as-code/recognition", + "score": 1.0 + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0 + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 1.0 + }, + { + "label": "domain-driven-design/recognition", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 1.0 + }, + { + "label": "ears-requirements/recognition", + "score": 1.0 + }, + { + "label": "ears-requirements/application-anchor", + "score": 1.0 + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 0.75 + }, + { + "label": "event-driven-architecture/recognition", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "fagan-inspection/recognition", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-anchor", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 1.0 + }, + { + "label": "feynman-technique/recognition", + "score": 1.0 + }, + { + "label": "feynman-technique/application-anchor", + "score": 1.0 + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.0 + }, + { + "label": "five-whys/recognition", + "score": 1.0 + }, + { + "label": "five-whys/application-anchor", + "score": 1.0 + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0 + }, + { + "label": "fowler-patterns/recognition", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-anchor", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0 + }, + { + "label": "gherkin/recognition", + "score": 1.0 + }, + { + "label": "gherkin/application-anchor", + "score": 1.0 + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0 + }, + { + "label": "github-flow/recognition", + "score": 1.0 + }, + { + "label": "github-flow/application-anchor", + "score": 1.0 + }, + { + "label": "github-flow/application-paraphrase", + "score": 0.75 + }, + { + "label": "gutes-deutsch-wolf-schneider/recognition", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/recognition", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 0.75 + }, + { + "label": "impact-mapping/recognition", + "score": 1.0 + }, + { + "label": "impact-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "invest/recognition", + "score": 1.0 + }, + { + "label": "invest/application-anchor", + "score": 1.0 + }, + { + "label": "invest/application-paraphrase", + "score": 1.0 + }, + { + "label": "iso-25010/recognition", + "score": 1.0 + }, + { + "label": "iso-25010/application-anchor", + "score": 1.0 + }, + { + "label": "iso-25010/application-paraphrase", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/recognition", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "lasr/recognition", + "score": 0.75 + }, + { + "label": "lasr/application-anchor", + "score": 1.0 + }, + { + "label": "lasr/application-paraphrase", + "score": 1.0 + }, + { + "label": "linddun/recognition", + "score": 1.0 + }, + { + "label": "linddun/application-anchor", + "score": 1.0 + }, + { + "label": "linddun/application-paraphrase", + "score": 1.0 + }, + { + "label": "llm-evaluations/recognition", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-anchor", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0 + }, + { + "label": "madr/recognition", + "score": 1.0 + }, + { + "label": "madr/application-anchor", + "score": 1.0 + }, + { + "label": "madr/application-paraphrase", + "score": 1.0 + }, + { + "label": "mece/recognition", + "score": 1.0 + }, + { + "label": "mece/application-anchor", + "score": 1.0 + }, + { + "label": "mece/application-paraphrase", + "score": 1.0 + }, + { + "label": "morphological-box/recognition", + "score": 1.0 + }, + { + "label": "morphological-box/application-anchor", + "score": 1.0 + }, + { + "label": "morphological-box/application-paraphrase", + "score": 1.0 + }, + { + "label": "moscow/recognition", + "score": 1.0 + }, + { + "label": "moscow/application-anchor", + "score": 1.0 + }, + { + "label": "moscow/application-paraphrase", + "score": 0.75 + }, + { + "label": "mutation-testing/recognition", + "score": 1.0 + }, + { + "label": "mutation-testing/application-anchor", + "score": 1.0 + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "negative-control/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/application-anchor", + "score": 1.0 + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0 + }, + { + "label": "owasp-top-10/recognition", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-anchor", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 1.0 + }, + { + "label": "prd/recognition", + "score": 0.75 + }, + { + "label": "prd/application-anchor", + "score": 1.0 + }, + { + "label": "prd/application-paraphrase", + "score": 1.0 + }, + { + "label": "problem-space-nvc/recognition", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 1.0 + }, + { + "label": "property-based-testing/recognition", + "score": 1.0 + }, + { + "label": "property-based-testing/application-anchor", + "score": 0.75 + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 0.75 + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-anchor", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 1.0 + }, + { + "label": "sanity-check/recognition", + "score": 0.0 + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-anchor", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-paraphrase", + "score": 1.0 + }, + { + "label": "socratic-method/recognition", + "score": 1.0 + }, + { + "label": "socratic-method/application-anchor", + "score": 1.0 + }, + { + "label": "socratic-method/application-paraphrase", + "score": 1.0 + }, + { + "label": "sota/recognition", + "score": 1.0 + }, + { + "label": "sota/application-anchor", + "score": 1.0 + }, + { + "label": "sota/application-paraphrase", + "score": 1.0 + }, + { + "label": "spc/recognition", + "score": 1.0 + }, + { + "label": "spc/application-anchor", + "score": 1.0 + }, + { + "label": "spc/application-paraphrase", + "score": 1.0 + }, + { + "label": "stride/recognition", + "score": 1.0 + }, + { + "label": "stride/application-anchor", + "score": 1.0 + }, + { + "label": "stride/application-paraphrase", + "score": 1.0 + }, + { + "label": "swot/recognition", + "score": 1.0 + }, + { + "label": "swot/application-anchor", + "score": 1.0 + }, + { + "label": "swot/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/recognition", + "score": 0.75 + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/recognition", + "score": 0.75 + }, + { + "label": "tdd-london-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-1", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 0.5 + }, + { + "label": "tdd-london-school/consistency-language", + "score": 1.0 + }, + { + "label": "testing-pyramid/recognition", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 1.0 + }, + { + "label": "timtowtdi/recognition", + "score": 1.0 + }, + { + "label": "timtowtdi/application-anchor", + "score": 1.0 + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 1.0 + }, + { + "label": "user-story-mapping/recognition", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "wardley-mapping/recognition", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 1.0 + } + ] + }, + "duration_seconds": 938.4 +} \ No newline at end of file diff --git a/evaluations/summaries/pilot-20260326-070127_mistral-medium-2508.json b/evaluations/summaries/pilot-20260326-070127_mistral-medium-2508.json new file mode 100644 index 0000000..3b26758 --- /dev/null +++ b/evaluations/summaries/pilot-20260326-070127_mistral-medium-2508.json @@ -0,0 +1,792 @@ +{ + "timestamp": "2026-03-26T07:01:27.544539+00:00", + "config": { + "models": [ + "mistral" + ], + "openai_model": null, + "mistral_model": "mistral-medium-2508", + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "mistral": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 0.75 + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 0.5 + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 0.25 + }, + { + "label": "arc42/recognition", + "score": 0.75 + }, + { + "label": "arc42/application-anchor", + "score": 0.75 + }, + { + "label": "arc42/application-paraphrase", + "score": 0.5 + }, + { + "label": "arc42/consistency-variant-1", + "score": 0.25 + }, + { + "label": "arc42/consistency-variant-2", + "score": 0.5 + }, + { + "label": "arc42/consistency-variant-3", + "score": 0.5 + }, + { + "label": "arc42/consistency-language", + "score": 0.0 + }, + { + "label": "atam/recognition", + "score": 0.75 + }, + { + "label": "atam/application-anchor", + "score": 0.5 + }, + { + "label": "atam/application-paraphrase", + "score": 0.5 + }, + { + "label": "bdd-given-when-then/recognition", + "score": 0.25 + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 0.5 + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 0.25 + }, + { + "label": "bem-methodology/recognition", + "score": 0.5 + }, + { + "label": "bem-methodology/application-anchor", + "score": 0.5 + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 0.25 + }, + { + "label": "bluf/recognition", + "score": 0.75 + }, + { + "label": "bluf/application-anchor", + "score": 0.75 + }, + { + "label": "bluf/application-paraphrase", + "score": 0.75 + }, + { + "label": "c4-diagrams/recognition", + "score": 0.25 + }, + { + "label": "c4-diagrams/application-anchor", + "score": 0.5 + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 0.0 + }, + { + "label": "chain-of-thought/recognition", + "score": 0.5 + }, + { + "label": "chain-of-thought/application-anchor", + "score": 0.5 + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 0.75 + }, + { + "label": "clean-architecture/recognition", + "score": 1.0 + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/recognition", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 0.75 + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 1.0 + }, + { + "label": "conventional-commits/recognition", + "score": 1.0 + }, + { + "label": "conventional-commits/application-anchor", + "score": 1.0 + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 1.0 + }, + { + "label": "cqrs/recognition", + "score": 1.0 + }, + { + "label": "cqrs/application-anchor", + "score": 0.75 + }, + { + "label": "cqrs/application-paraphrase", + "score": 1.0 + }, + { + "label": "cynefin-framework/recognition", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-anchor", + "score": 0.75 + }, + { + "label": "cynefin-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "definition-of-done/recognition", + "score": 0.75 + }, + { + "label": "definition-of-done/application-anchor", + "score": 1.0 + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 0.75 + }, + { + "label": "devils-advocate/recognition", + "score": 1.0 + }, + { + "label": "devils-advocate/application-anchor", + "score": 1.0 + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 1.0 + }, + { + "label": "diataxis-framework/recognition", + "score": 0.75 + }, + { + "label": "diataxis-framework/application-anchor", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "docs-as-code/recognition", + "score": 1.0 + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0 + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 0.75 + }, + { + "label": "domain-driven-design/recognition", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 1.0 + }, + { + "label": "ears-requirements/recognition", + "score": 1.0 + }, + { + "label": "ears-requirements/application-anchor", + "score": 1.0 + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 1.0 + }, + { + "label": "event-driven-architecture/recognition", + "score": 0.75 + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 0.75 + }, + { + "label": "fagan-inspection/recognition", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-anchor", + "score": 0.75 + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 0.75 + }, + { + "label": "feynman-technique/recognition", + "score": 1.0 + }, + { + "label": "feynman-technique/application-anchor", + "score": 1.0 + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.75 + }, + { + "label": "five-whys/recognition", + "score": 1.0 + }, + { + "label": "five-whys/application-anchor", + "score": 1.0 + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0 + }, + { + "label": "fowler-patterns/recognition", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-anchor", + "score": 0.75 + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0 + }, + { + "label": "gherkin/recognition", + "score": 1.0 + }, + { + "label": "gherkin/application-anchor", + "score": 1.0 + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0 + }, + { + "label": "github-flow/recognition", + "score": 1.0 + }, + { + "label": "github-flow/application-anchor", + "score": 1.0 + }, + { + "label": "github-flow/application-paraphrase", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/recognition", + "score": 0.75 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 0.75 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/recognition", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 0.75 + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 0.75 + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 1.0 + }, + { + "label": "impact-mapping/recognition", + "score": 1.0 + }, + { + "label": "impact-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "invest/recognition", + "score": 1.0 + }, + { + "label": "invest/application-anchor", + "score": 1.0 + }, + { + "label": "invest/application-paraphrase", + "score": 1.0 + }, + { + "label": "iso-25010/recognition", + "score": 1.0 + }, + { + "label": "iso-25010/application-anchor", + "score": 1.0 + }, + { + "label": "iso-25010/application-paraphrase", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/recognition", + "score": 0.75 + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "lasr/recognition", + "score": 1.0 + }, + { + "label": "lasr/application-anchor", + "score": 1.0 + }, + { + "label": "lasr/application-paraphrase", + "score": 1.0 + }, + { + "label": "linddun/recognition", + "score": 1.0 + }, + { + "label": "linddun/application-anchor", + "score": 1.0 + }, + { + "label": "linddun/application-paraphrase", + "score": 0.75 + }, + { + "label": "llm-evaluations/recognition", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-anchor", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0 + }, + { + "label": "madr/recognition", + "score": 0.25 + }, + { + "label": "madr/application-anchor", + "score": 1.0 + }, + { + "label": "madr/application-paraphrase", + "score": 1.0 + }, + { + "label": "mece/recognition", + "score": 1.0 + }, + { + "label": "mece/application-anchor", + "score": 0.75 + }, + { + "label": "mece/application-paraphrase", + "score": 1.0 + }, + { + "label": "morphological-box/recognition", + "score": 1.0 + }, + { + "label": "morphological-box/application-anchor", + "score": 0.75 + }, + { + "label": "morphological-box/application-paraphrase", + "score": 1.0 + }, + { + "label": "moscow/recognition", + "score": 1.0 + }, + { + "label": "moscow/application-anchor", + "score": 1.0 + }, + { + "label": "moscow/application-paraphrase", + "score": 0.75 + }, + { + "label": "mutation-testing/recognition", + "score": 1.0 + }, + { + "label": "mutation-testing/application-anchor", + "score": 0.75 + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "negative-control/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/application-anchor", + "score": 0.75 + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0 + }, + { + "label": "owasp-top-10/recognition", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-anchor", + "score": 0.75 + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 1.0 + }, + { + "label": "prd/recognition", + "score": 0.5 + }, + { + "label": "prd/application-anchor", + "score": 1.0 + }, + { + "label": "prd/application-paraphrase", + "score": 1.0 + }, + { + "label": "problem-space-nvc/recognition", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 1.0 + }, + { + "label": "property-based-testing/recognition", + "score": 0.75 + }, + { + "label": "property-based-testing/application-anchor", + "score": 1.0 + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-anchor", + "score": 0.75 + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 1.0 + }, + { + "label": "sanity-check/recognition", + "score": 0.0 + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-anchor", + "score": 0.75 + }, + { + "label": "semantic-versioning/application-paraphrase", + "score": 0.75 + }, + { + "label": "socratic-method/recognition", + "score": 1.0 + }, + { + "label": "socratic-method/application-anchor", + "score": 0.75 + }, + { + "label": "socratic-method/application-paraphrase", + "score": 1.0 + }, + { + "label": "sota/recognition", + "score": 1.0 + }, + { + "label": "sota/application-anchor", + "score": 0.75 + }, + { + "label": "sota/application-paraphrase", + "score": 0.75 + }, + { + "label": "spc/recognition", + "score": 1.0 + }, + { + "label": "spc/application-anchor", + "score": 0.75 + }, + { + "label": "spc/application-paraphrase", + "score": 1.0 + }, + { + "label": "stride/recognition", + "score": 0.75 + }, + { + "label": "stride/application-anchor", + "score": 1.0 + }, + { + "label": "stride/application-paraphrase", + "score": 1.0 + }, + { + "label": "swot/recognition", + "score": 0.75 + }, + { + "label": "swot/application-anchor", + "score": 1.0 + }, + { + "label": "swot/application-paraphrase", + "score": 0.75 + }, + { + "label": "tdd-chicago-school/recognition", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/recognition", + "score": 0.75 + }, + { + "label": "tdd-london-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 0.75 + }, + { + "label": "tdd-london-school/consistency-variant-1", + "score": 0.75 + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 0.0 + }, + { + "label": "tdd-london-school/consistency-language", + "score": 0.75 + }, + { + "label": "testing-pyramid/recognition", + "score": 0.75 + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 0.75 + }, + { + "label": "timtowtdi/recognition", + "score": 0.75 + }, + { + "label": "timtowtdi/application-anchor", + "score": 1.0 + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 0.75 + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 0.75 + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 1.0 + }, + { + "label": "user-story-mapping/recognition", + "score": 0.75 + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "wardley-mapping/recognition", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 1.0 + } + ] + }, + "duration_seconds": 1872.8 +} \ No newline at end of file diff --git a/evaluations/summaries/pilot-20260326-073241_devstral-2512.json b/evaluations/summaries/pilot-20260326-073241_devstral-2512.json new file mode 100644 index 0000000..e22ba86 --- /dev/null +++ b/evaluations/summaries/pilot-20260326-073241_devstral-2512.json @@ -0,0 +1,792 @@ +{ + "timestamp": "2026-03-26T07:32:41.051735+00:00", + "config": { + "models": [ + "mistral" + ], + "openai_model": null, + "mistral_model": "devstral-2512", + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "mistral": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 1.0 + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 1.0 + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/recognition", + "score": 1.0 + }, + { + "label": "arc42/application-anchor", + "score": 1.0 + }, + { + "label": "arc42/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-1", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-2", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-3", + "score": 1.0 + }, + { + "label": "arc42/consistency-language", + "score": 1.0 + }, + { + "label": "atam/recognition", + "score": 1.0 + }, + { + "label": "atam/application-anchor", + "score": 1.0 + }, + { + "label": "atam/application-paraphrase", + "score": 0.5 + }, + { + "label": "bdd-given-when-then/recognition", + "score": 0.0 + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 0.5 + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 1.0 + }, + { + "label": "bem-methodology/recognition", + "score": 1.0 + }, + { + "label": "bem-methodology/application-anchor", + "score": 1.0 + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 1.0 + }, + { + "label": "bluf/recognition", + "score": 1.0 + }, + { + "label": "bluf/application-anchor", + "score": 1.0 + }, + { + "label": "bluf/application-paraphrase", + "score": 1.0 + }, + { + "label": "c4-diagrams/recognition", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-anchor", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 1.0 + }, + { + "label": "chain-of-thought/recognition", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-anchor", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 1.0 + }, + { + "label": "clean-architecture/recognition", + "score": 1.0 + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/recognition", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 1.0 + }, + { + "label": "conventional-commits/recognition", + "score": 1.0 + }, + { + "label": "conventional-commits/application-anchor", + "score": 1.0 + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 1.0 + }, + { + "label": "cqrs/recognition", + "score": 1.0 + }, + { + "label": "cqrs/application-anchor", + "score": 1.0 + }, + { + "label": "cqrs/application-paraphrase", + "score": 1.0 + }, + { + "label": "cynefin-framework/recognition", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-anchor", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "definition-of-done/recognition", + "score": 1.0 + }, + { + "label": "definition-of-done/application-anchor", + "score": 1.0 + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "devils-advocate/recognition", + "score": 1.0 + }, + { + "label": "devils-advocate/application-anchor", + "score": 1.0 + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 1.0 + }, + { + "label": "diataxis-framework/recognition", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-anchor", + "score": 1.0 + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "docs-as-code/recognition", + "score": 1.0 + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0 + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 1.0 + }, + { + "label": "domain-driven-design/recognition", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 1.0 + }, + { + "label": "ears-requirements/recognition", + "score": 1.0 + }, + { + "label": "ears-requirements/application-anchor", + "score": 0.75 + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 0.75 + }, + { + "label": "event-driven-architecture/recognition", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "fagan-inspection/recognition", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-anchor", + "score": 1.0 + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 1.0 + }, + { + "label": "feynman-technique/recognition", + "score": 1.0 + }, + { + "label": "feynman-technique/application-anchor", + "score": 0.75 + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.75 + }, + { + "label": "five-whys/recognition", + "score": 1.0 + }, + { + "label": "five-whys/application-anchor", + "score": 1.0 + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0 + }, + { + "label": "fowler-patterns/recognition", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-anchor", + "score": 1.0 + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0 + }, + { + "label": "gherkin/recognition", + "score": 1.0 + }, + { + "label": "gherkin/application-anchor", + "score": 1.0 + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0 + }, + { + "label": "github-flow/recognition", + "score": 1.0 + }, + { + "label": "github-flow/application-anchor", + "score": 1.0 + }, + { + "label": "github-flow/application-paraphrase", + "score": 0.75 + }, + { + "label": "gutes-deutsch-wolf-schneider/recognition", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/recognition", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 0.75 + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 0.75 + }, + { + "label": "impact-mapping/recognition", + "score": 1.0 + }, + { + "label": "impact-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "invest/recognition", + "score": 1.0 + }, + { + "label": "invest/application-anchor", + "score": 1.0 + }, + { + "label": "invest/application-paraphrase", + "score": 1.0 + }, + { + "label": "iso-25010/recognition", + "score": 1.0 + }, + { + "label": "iso-25010/application-anchor", + "score": 1.0 + }, + { + "label": "iso-25010/application-paraphrase", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/recognition", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "lasr/recognition", + "score": 0.5 + }, + { + "label": "lasr/application-anchor", + "score": 1.0 + }, + { + "label": "lasr/application-paraphrase", + "score": 1.0 + }, + { + "label": "linddun/recognition", + "score": 1.0 + }, + { + "label": "linddun/application-anchor", + "score": 1.0 + }, + { + "label": "linddun/application-paraphrase", + "score": 1.0 + }, + { + "label": "llm-evaluations/recognition", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-anchor", + "score": 1.0 + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0 + }, + { + "label": "madr/recognition", + "score": 0.5 + }, + { + "label": "madr/application-anchor", + "score": 1.0 + }, + { + "label": "madr/application-paraphrase", + "score": 1.0 + }, + { + "label": "mece/recognition", + "score": 1.0 + }, + { + "label": "mece/application-anchor", + "score": 1.0 + }, + { + "label": "mece/application-paraphrase", + "score": 1.0 + }, + { + "label": "morphological-box/recognition", + "score": 1.0 + }, + { + "label": "morphological-box/application-anchor", + "score": 1.0 + }, + { + "label": "morphological-box/application-paraphrase", + "score": 1.0 + }, + { + "label": "moscow/recognition", + "score": 1.0 + }, + { + "label": "moscow/application-anchor", + "score": 1.0 + }, + { + "label": "moscow/application-paraphrase", + "score": 1.0 + }, + { + "label": "mutation-testing/recognition", + "score": 1.0 + }, + { + "label": "mutation-testing/application-anchor", + "score": 1.0 + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "negative-control/recognition", + "score": 0.75 + }, + { + "label": "nelson-rules/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/application-anchor", + "score": 1.0 + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0 + }, + { + "label": "owasp-top-10/recognition", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-anchor", + "score": 1.0 + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 1.0 + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 1.0 + }, + { + "label": "prd/recognition", + "score": 0.5 + }, + { + "label": "prd/application-anchor", + "score": 1.0 + }, + { + "label": "prd/application-paraphrase", + "score": 1.0 + }, + { + "label": "problem-space-nvc/recognition", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 1.0 + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 1.0 + }, + { + "label": "property-based-testing/recognition", + "score": 1.0 + }, + { + "label": "property-based-testing/application-anchor", + "score": 0.75 + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-anchor", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 1.0 + }, + { + "label": "sanity-check/recognition", + "score": 0.0 + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-anchor", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-paraphrase", + "score": 1.0 + }, + { + "label": "socratic-method/recognition", + "score": 1.0 + }, + { + "label": "socratic-method/application-anchor", + "score": 1.0 + }, + { + "label": "socratic-method/application-paraphrase", + "score": 1.0 + }, + { + "label": "sota/recognition", + "score": 1.0 + }, + { + "label": "sota/application-anchor", + "score": 1.0 + }, + { + "label": "sota/application-paraphrase", + "score": 1.0 + }, + { + "label": "spc/recognition", + "score": 1.0 + }, + { + "label": "spc/application-anchor", + "score": 1.0 + }, + { + "label": "spc/application-paraphrase", + "score": 1.0 + }, + { + "label": "stride/recognition", + "score": 1.0 + }, + { + "label": "stride/application-anchor", + "score": 1.0 + }, + { + "label": "stride/application-paraphrase", + "score": 1.0 + }, + { + "label": "swot/recognition", + "score": 1.0 + }, + { + "label": "swot/application-anchor", + "score": 1.0 + }, + { + "label": "swot/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/recognition", + "score": 0.75 + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/recognition", + "score": 1.0 + }, + { + "label": "tdd-london-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-1", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 1.0 + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 0.25 + }, + { + "label": "tdd-london-school/consistency-language", + "score": 1.0 + }, + { + "label": "testing-pyramid/recognition", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 1.0 + }, + { + "label": "timtowtdi/recognition", + "score": 1.0 + }, + { + "label": "timtowtdi/application-anchor", + "score": 1.0 + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 0.5 + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 1.0 + }, + { + "label": "user-story-mapping/recognition", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 1.0 + }, + { + "label": "wardley-mapping/recognition", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 1.0 + } + ] + }, + "duration_seconds": 919.6 +} \ No newline at end of file diff --git a/evaluations/summaries/pilot-20260326-074132_mistral-small-2603.json b/evaluations/summaries/pilot-20260326-074132_mistral-small-2603.json new file mode 100644 index 0000000..d7d851e --- /dev/null +++ b/evaluations/summaries/pilot-20260326-074132_mistral-small-2603.json @@ -0,0 +1,792 @@ +{ + "timestamp": "2026-03-26T07:41:32.011654+00:00", + "config": { + "models": [ + "mistral" + ], + "openai_model": null, + "mistral_model": "mistral-small-2603", + "deepseek_model": null, + "ollama_model": null, + "ollama_url": null, + "no_think": null, + "temperature": 0.0 + }, + "models": { + "mistral": [ + { + "label": "adr-according-to-nygard/recognition", + "score": 0.75 + }, + { + "label": "adr-according-to-nygard/application-anchor", + "score": 0.75 + }, + { + "label": "adr-according-to-nygard/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/recognition", + "score": 1.0 + }, + { + "label": "arc42/application-anchor", + "score": 1.0 + }, + { + "label": "arc42/application-paraphrase", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-1", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-2", + "score": 1.0 + }, + { + "label": "arc42/consistency-variant-3", + "score": 1.0 + }, + { + "label": "arc42/consistency-language", + "score": 1.0 + }, + { + "label": "atam/recognition", + "score": 1.0 + }, + { + "label": "atam/application-anchor", + "score": 0.75 + }, + { + "label": "atam/application-paraphrase", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/recognition", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/application-anchor", + "score": 1.0 + }, + { + "label": "bdd-given-when-then/application-paraphrase", + "score": 0.75 + }, + { + "label": "bem-methodology/recognition", + "score": 1.0 + }, + { + "label": "bem-methodology/application-anchor", + "score": 1.0 + }, + { + "label": "bem-methodology/application-paraphrase", + "score": 1.0 + }, + { + "label": "bluf/recognition", + "score": 1.0 + }, + { + "label": "bluf/application-anchor", + "score": 1.0 + }, + { + "label": "bluf/application-paraphrase", + "score": 1.0 + }, + { + "label": "c4-diagrams/recognition", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-anchor", + "score": 1.0 + }, + { + "label": "c4-diagrams/application-paraphrase", + "score": 1.0 + }, + { + "label": "chain-of-thought/recognition", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-anchor", + "score": 1.0 + }, + { + "label": "chain-of-thought/application-paraphrase", + "score": 1.0 + }, + { + "label": "clean-architecture/recognition", + "score": 1.0 + }, + { + "label": "clean-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "clean-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/recognition", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-anchor", + "score": 1.0 + }, + { + "label": "control-chart-shewhart/application-paraphrase", + "score": 1.0 + }, + { + "label": "conventional-commits/recognition", + "score": 1.0 + }, + { + "label": "conventional-commits/application-anchor", + "score": 0.5 + }, + { + "label": "conventional-commits/application-paraphrase", + "score": 0.75 + }, + { + "label": "cqrs/recognition", + "score": 1.0 + }, + { + "label": "cqrs/application-anchor", + "score": 0.75 + }, + { + "label": "cqrs/application-paraphrase", + "score": 0.5 + }, + { + "label": "cynefin-framework/recognition", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-anchor", + "score": 1.0 + }, + { + "label": "cynefin-framework/application-paraphrase", + "score": 1.0 + }, + { + "label": "definition-of-done/recognition", + "score": 0.75 + }, + { + "label": "definition-of-done/application-anchor", + "score": 0.75 + }, + { + "label": "definition-of-done/application-paraphrase", + "score": 0.5 + }, + { + "label": "devils-advocate/recognition", + "score": 0.5 + }, + { + "label": "devils-advocate/application-anchor", + "score": 0.75 + }, + { + "label": "devils-advocate/application-paraphrase", + "score": 0.5 + }, + { + "label": "diataxis-framework/recognition", + "score": 0.25 + }, + { + "label": "diataxis-framework/application-anchor", + "score": 0.75 + }, + { + "label": "diataxis-framework/application-paraphrase", + "score": 0.75 + }, + { + "label": "docs-as-code/recognition", + "score": 1.0 + }, + { + "label": "docs-as-code/application-anchor", + "score": 1.0 + }, + { + "label": "docs-as-code/application-paraphrase", + "score": 0.5 + }, + { + "label": "domain-driven-design/recognition", + "score": 0.5 + }, + { + "label": "domain-driven-design/application-anchor", + "score": 1.0 + }, + { + "label": "domain-driven-design/application-paraphrase", + "score": 0.5 + }, + { + "label": "ears-requirements/recognition", + "score": 0.0 + }, + { + "label": "ears-requirements/application-anchor", + "score": 0.0 + }, + { + "label": "ears-requirements/application-paraphrase", + "score": 0.5 + }, + { + "label": "event-driven-architecture/recognition", + "score": 0.75 + }, + { + "label": "event-driven-architecture/application-anchor", + "score": 0.75 + }, + { + "label": "event-driven-architecture/application-paraphrase", + "score": 0.75 + }, + { + "label": "fagan-inspection/recognition", + "score": 0.75 + }, + { + "label": "fagan-inspection/application-anchor", + "score": 0.25 + }, + { + "label": "fagan-inspection/application-paraphrase", + "score": 0.75 + }, + { + "label": "feynman-technique/recognition", + "score": 1.0 + }, + { + "label": "feynman-technique/application-anchor", + "score": 0.5 + }, + { + "label": "feynman-technique/application-paraphrase", + "score": 0.75 + }, + { + "label": "five-whys/recognition", + "score": 1.0 + }, + { + "label": "five-whys/application-anchor", + "score": 0.75 + }, + { + "label": "five-whys/application-paraphrase", + "score": 1.0 + }, + { + "label": "fowler-patterns/recognition", + "score": 0.5 + }, + { + "label": "fowler-patterns/application-anchor", + "score": 0.75 + }, + { + "label": "fowler-patterns/application-paraphrase", + "score": 1.0 + }, + { + "label": "gherkin/recognition", + "score": 0.75 + }, + { + "label": "gherkin/application-anchor", + "score": 0.75 + }, + { + "label": "gherkin/application-paraphrase", + "score": 1.0 + }, + { + "label": "github-flow/recognition", + "score": 0.75 + }, + { + "label": "github-flow/application-anchor", + "score": 0.5 + }, + { + "label": "github-flow/application-paraphrase", + "score": 0.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/recognition", + "score": 0.75 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-anchor", + "score": 1.0 + }, + { + "label": "gutes-deutsch-wolf-schneider/application-paraphrase", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/recognition", + "score": 0.75 + }, + { + "label": "hexagonal-architecture/application-anchor", + "score": 1.0 + }, + { + "label": "hexagonal-architecture/application-paraphrase", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/recognition", + "score": 1.0 + }, + { + "label": "iec-61508-sil-levels/application-anchor", + "score": 0.0 + }, + { + "label": "iec-61508-sil-levels/application-paraphrase", + "score": 0.0 + }, + { + "label": "impact-mapping/recognition", + "score": 0.75 + }, + { + "label": "impact-mapping/application-anchor", + "score": 0.5 + }, + { + "label": "impact-mapping/application-paraphrase", + "score": 0.25 + }, + { + "label": "invest/recognition", + "score": 0.25 + }, + { + "label": "invest/application-anchor", + "score": 1.0 + }, + { + "label": "invest/application-paraphrase", + "score": 1.0 + }, + { + "label": "iso-25010/recognition", + "score": 1.0 + }, + { + "label": "iso-25010/application-anchor", + "score": 0.5 + }, + { + "label": "iso-25010/application-paraphrase", + "score": 0.75 + }, + { + "label": "jobs-to-be-done/recognition", + "score": 1.0 + }, + { + "label": "jobs-to-be-done/application-anchor", + "score": 0.5 + }, + { + "label": "jobs-to-be-done/application-paraphrase", + "score": 1.0 + }, + { + "label": "lasr/recognition", + "score": 0.5 + }, + { + "label": "lasr/application-anchor", + "score": 0.5 + }, + { + "label": "lasr/application-paraphrase", + "score": 0.75 + }, + { + "label": "linddun/recognition", + "score": 1.0 + }, + { + "label": "linddun/application-anchor", + "score": 1.0 + }, + { + "label": "linddun/application-paraphrase", + "score": 0.5 + }, + { + "label": "llm-evaluations/recognition", + "score": 0.75 + }, + { + "label": "llm-evaluations/application-anchor", + "score": 0.75 + }, + { + "label": "llm-evaluations/application-paraphrase", + "score": 1.0 + }, + { + "label": "madr/recognition", + "score": 0.75 + }, + { + "label": "madr/application-anchor", + "score": 0.5 + }, + { + "label": "madr/application-paraphrase", + "score": 0.75 + }, + { + "label": "mece/recognition", + "score": 1.0 + }, + { + "label": "mece/application-anchor", + "score": 0.5 + }, + { + "label": "mece/application-paraphrase", + "score": 1.0 + }, + { + "label": "morphological-box/recognition", + "score": 1.0 + }, + { + "label": "morphological-box/application-anchor", + "score": 0.25 + }, + { + "label": "morphological-box/application-paraphrase", + "score": 0.75 + }, + { + "label": "moscow/recognition", + "score": 0.75 + }, + { + "label": "moscow/application-anchor", + "score": 1.0 + }, + { + "label": "moscow/application-paraphrase", + "score": 0.5 + }, + { + "label": "mutation-testing/recognition", + "score": 0.75 + }, + { + "label": "mutation-testing/application-anchor", + "score": 0.75 + }, + { + "label": "mutation-testing/application-paraphrase", + "score": 0.25 + }, + { + "label": "negative-control/recognition", + "score": 0.5 + }, + { + "label": "nelson-rules/recognition", + "score": 1.0 + }, + { + "label": "nelson-rules/application-anchor", + "score": 0.5 + }, + { + "label": "nelson-rules/application-paraphrase", + "score": 1.0 + }, + { + "label": "owasp-top-10/recognition", + "score": 0.75 + }, + { + "label": "owasp-top-10/application-anchor", + "score": 0.75 + }, + { + "label": "owasp-top-10/application-paraphrase", + "score": 0.5 + }, + { + "label": "plain-english-strunk-white/recognition", + "score": 0.5 + }, + { + "label": "plain-english-strunk-white/application-anchor", + "score": 0.5 + }, + { + "label": "plain-english-strunk-white/application-paraphrase", + "score": 0.75 + }, + { + "label": "prd/recognition", + "score": 0.25 + }, + { + "label": "prd/application-anchor", + "score": 0.5 + }, + { + "label": "prd/application-paraphrase", + "score": 1.0 + }, + { + "label": "problem-space-nvc/recognition", + "score": 0.75 + }, + { + "label": "problem-space-nvc/application-anchor", + "score": 0.25 + }, + { + "label": "problem-space-nvc/application-paraphrase", + "score": 0.5 + }, + { + "label": "property-based-testing/recognition", + "score": 0.75 + }, + { + "label": "property-based-testing/application-anchor", + "score": 0.25 + }, + { + "label": "property-based-testing/application-paraphrase", + "score": 1.0 + }, + { + "label": "pyramid-principle/recognition", + "score": 1.0 + }, + { + "label": "pyramid-principle/application-anchor", + "score": 0.75 + }, + { + "label": "pyramid-principle/application-paraphrase", + "score": 0.75 + }, + { + "label": "sanity-check/recognition", + "score": 0.0 + }, + { + "label": "semantic-versioning/recognition", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-anchor", + "score": 1.0 + }, + { + "label": "semantic-versioning/application-paraphrase", + "score": 0.75 + }, + { + "label": "socratic-method/recognition", + "score": 0.25 + }, + { + "label": "socratic-method/application-anchor", + "score": 1.0 + }, + { + "label": "socratic-method/application-paraphrase", + "score": 0.75 + }, + { + "label": "sota/recognition", + "score": 1.0 + }, + { + "label": "sota/application-anchor", + "score": 0.5 + }, + { + "label": "sota/application-paraphrase", + "score": 0.75 + }, + { + "label": "spc/recognition", + "score": 0.5 + }, + { + "label": "spc/application-anchor", + "score": 0.25 + }, + { + "label": "spc/application-paraphrase", + "score": 0.75 + }, + { + "label": "stride/recognition", + "score": 0.5 + }, + { + "label": "stride/application-anchor", + "score": 0.5 + }, + { + "label": "stride/application-paraphrase", + "score": 1.0 + }, + { + "label": "swot/recognition", + "score": 1.0 + }, + { + "label": "swot/application-anchor", + "score": 0.25 + }, + { + "label": "swot/application-paraphrase", + "score": 0.75 + }, + { + "label": "tdd-chicago-school/recognition", + "score": 0.5 + }, + { + "label": "tdd-chicago-school/application-anchor", + "score": 1.0 + }, + { + "label": "tdd-chicago-school/application-paraphrase", + "score": 0.5 + }, + { + "label": "tdd-london-school/recognition", + "score": 0.5 + }, + { + "label": "tdd-london-school/application-anchor", + "score": 0.5 + }, + { + "label": "tdd-london-school/application-paraphrase", + "score": 0.75 + }, + { + "label": "tdd-london-school/consistency-variant-1", + "score": 0.5 + }, + { + "label": "tdd-london-school/consistency-variant-2", + "score": 0.75 + }, + { + "label": "tdd-london-school/consistency-variant-3", + "score": 0.25 + }, + { + "label": "tdd-london-school/consistency-language", + "score": 0.5 + }, + { + "label": "testing-pyramid/recognition", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-anchor", + "score": 1.0 + }, + { + "label": "testing-pyramid/application-paraphrase", + "score": 0.75 + }, + { + "label": "timtowtdi/recognition", + "score": 0.75 + }, + { + "label": "timtowtdi/application-anchor", + "score": 0.75 + }, + { + "label": "timtowtdi/application-paraphrase", + "score": 1.0 + }, + { + "label": "todotxt-flavoured-markdown/recognition", + "score": 0.75 + }, + { + "label": "todotxt-flavoured-markdown/application-anchor", + "score": 0.75 + }, + { + "label": "todotxt-flavoured-markdown/application-paraphrase", + "score": 0.75 + }, + { + "label": "user-story-mapping/recognition", + "score": 0.75 + }, + { + "label": "user-story-mapping/application-anchor", + "score": 1.0 + }, + { + "label": "user-story-mapping/application-paraphrase", + "score": 0.5 + }, + { + "label": "wardley-mapping/recognition", + "score": 0.5 + }, + { + "label": "wardley-mapping/application-anchor", + "score": 0.5 + }, + { + "label": "wardley-mapping/application-paraphrase", + "score": 0.25 + } + ] + }, + "duration_seconds": 1089.2 +} \ No newline at end of file diff --git a/website/public/evaluation-report.html b/website/public/evaluation-report.html index 0fd9fb2..584a172 100644 --- a/website/public/evaluation-report.html +++ b/website/public/evaluation-report.html @@ -37,7 +37,7 @@

Semantic Anchor Evaluation Report

-

Multiple-choice recognition test across 3 LLMs — 191 questions, 61 anchors

+

Multiple-choice recognition test across 6 LLMs — 191 questions, 61 anchors

≥80%
@@ -48,298 +48,336 @@

Semantic Anchor Evaluation Report

Model Summary

-
Claude Sonnet
+
claude-sonnet-4-20250514
99%
191 questions · pilot-20260324-174404.json
-
GPT-4o
+
devstral-2512
+
96%
+
191 questions · pilot-20260326-073241_devstral-2512.json
+
+
+
gpt-4o
98%
191 questions · pilot-20260324-192413.json
-
Mistral Large
+
mistral-large-2512
96%
191 questions · pilot-20260324-190600.json
+
+
mistral-medium-2508
+
85%
+
191 questions · pilot-20260326-070127_mistral-medium-2508.json
+
+
+
mistral-small-2603
+
74%
+
191 questions · pilot-20260326-074132_mistral-small-2603.json
+

Heatmap: Anchor × Model

- - - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Anchor / QuestionClaude SonnetGPT-4oMistral Largeclaude-sonnet-4-20250514devstral-2512gpt-4omistral-large-2512mistral-medium-2508mistral-small-2603
adr-according-to-nygard92%
application-anchor
application-paraphrase
recognition75%
arc42
application-anchor
application-paraphrase
consistency-language
consistency-variant-1
consistency-variant-2
consistency-variant-3
recognition
atam
application-anchor
application-paraphrase
recognition
bdd-given-when-then83%
application-anchor
application-paraphrase50%
recognition
bem-methodology
application-anchor
application-paraphrase
recognition
bluf
application-anchor
application-paraphrase
recognition
c4-diagrams
application-anchor
application-paraphrase
recognition
chain-of-thought
application-anchor
application-paraphrase
recognition
clean-architecture
application-anchor
application-paraphrase
recognition
control-chart-shewhart92%
application-anchor
application-paraphrase75%
recognition
conventional-commits
application-anchor
application-paraphrase
recognition
cqrs
application-anchor
application-paraphrase
recognition
cynefin-framework
application-anchor
application-paraphrase
recognition
definition-of-done
application-anchor
application-paraphrase
recognition
devils-advocate
application-anchor
application-paraphrase
recognition
diataxis-framework
application-anchor
application-paraphrase
recognition
docs-as-code
application-anchor
application-paraphrase
recognition
domain-driven-design
application-anchor
application-paraphrase
recognition
ears-requirements92%83%
application-anchor
application-paraphrase75%75%
recognition75%
event-driven-architecture
application-anchor
application-paraphrase
recognition
fagan-inspection
application-anchor
application-paraphrase
recognition
feynman-technique67%67%92%
application-anchor
application-paraphrase0%0%75%
recognition
five-whys
application-anchor
application-paraphrase
recognition
fowler-patterns
application-anchor
application-paraphrase
recognition
gherkin
application-anchor
application-paraphrase
recognition
github-flow92%92%92%
application-anchor
application-paraphrase75%75%75%
recognition
gutes-deutsch-wolf-schneider
application-anchor
application-paraphrase
recognition
hexagonal-architecture
application-anchor
application-paraphrase
recognition
iec-61508-sil-levels92%83%
application-anchor50%
application-paraphrase75%
recognition
impact-mapping
application-anchor
application-paraphrase
recognition
invest
application-anchor
application-paraphrase
recognition
iso-2501083%
application-anchor75%
application-paraphrase75%
recognition
jobs-to-be-done
application-anchor
application-paraphrase
recognition
lasr92%75%
application-anchor
application-paraphrase
recognition75%25%
linddun
application-anchor
application-paraphrase
recognition
llm-evaluations
application-anchor
application-paraphrase
recognition
madr
application-anchor
application-paraphrase
recognition
mece
application-anchor
application-paraphrase
recognition
morphological-box
application-anchor
application-paraphrase
recognition
moscow92%75%
application-anchor
application-paraphrase75%25%
recognition
mutation-testing
application-anchor
application-paraphrase
recognition
nelson-rules
application-anchor
application-paraphrase
recognition
owasp-top-10
application-anchor
application-paraphrase
recognition
plain-english-strunk-white
application-anchor
application-paraphrase
recognition
prd92%67%
application-anchor
application-paraphrase
recognition75%0%
problem-space-nvc83%
application-anchor75%
application-paraphrase75%
recognition
property-based-testing83%
application-anchor75%
application-paraphrase75%
recognition
pyramid-principle
application-anchor
application-paraphrase
recognition
semantic-versioning75%
application-anchor50%
application-paraphrase75%
recognition
socratic-method
application-anchor
application-paraphrase
recognition
sota
application-anchor
application-paraphrase
recognition
spc
application-anchor
application-paraphrase
recognition
stride
application-anchor
application-paraphrase
recognition
swot
application-anchor
application-paraphrase
recognition
tdd-chicago-school92%
application-anchor
application-paraphrase
recognition75%
tdd-london-school89%
application-anchor
application-paraphrase
consistency-language
consistency-variant-1
consistency-variant-2
consistency-variant-350%
recognition75%
testing-pyramid
application-anchor
application-paraphrase
recognition
timtowtdi
application-anchor
application-paraphrase
recognition
todotxt-flavoured-markdown83%
application-anchor
application-paraphrase
recognition50%
user-story-mapping
application-anchor
application-paraphrase
recognition
wardley-mapping
application-anchor
application-paraphrase
recognition
adr-according-to-nygard92%50%83%
application-anchor50%75%
application-paraphrase25%
recognition75%75%75%
arc4246%
application-anchor75%
application-paraphrase50%
consistency-language0%
consistency-variant-125%
consistency-variant-250%
consistency-variant-350%
recognition75%
atam83%58%92%
application-anchor50%75%
application-paraphrase50%50%
recognition75%
bdd-given-when-then50%83%33%92%
application-anchor50%50%
application-paraphrase50%25%75%
recognition0%25%
bem-methodology42%
application-anchor50%
application-paraphrase25%
recognition50%
bluf75%
application-anchor75%
application-paraphrase75%
recognition75%
c4-diagrams25%
application-anchor50%
application-paraphrase0%
recognition25%
chain-of-thought58%
application-anchor50%
application-paraphrase75%
recognition50%
clean-architecture
application-anchor
application-paraphrase
recognition
control-chart-shewhart92%92%
application-anchor75%
application-paraphrase75%
recognition
conventional-commits75%
application-anchor50%
application-paraphrase75%
recognition
cqrs92%75%
application-anchor75%75%
application-paraphrase50%
recognition
cynefin-framework92%
application-anchor75%
application-paraphrase
recognition
definition-of-done83%67%
application-anchor75%
application-paraphrase75%50%
recognition75%75%
devils-advocate58%
application-anchor75%
application-paraphrase50%
recognition50%
diataxis-framework92%58%
application-anchor75%
application-paraphrase75%
recognition75%25%
docs-as-code92%83%
application-anchor
application-paraphrase75%50%
recognition
domain-driven-design67%
application-anchor
application-paraphrase50%
recognition50%
ears-requirements83%92%83%17%
application-anchor75%0%
application-paraphrase75%75%75%50%
recognition75%0%
event-driven-architecture83%75%
application-anchor75%
application-paraphrase75%75%
recognition75%75%
fagan-inspection83%58%
application-anchor75%25%
application-paraphrase75%75%
recognition75%
feynman-technique67%83%67%92%92%75%
application-anchor75%50%
application-paraphrase0%75%0%75%75%75%
recognition
five-whys92%
application-anchor75%
application-paraphrase
recognition
fowler-patterns92%75%
application-anchor75%75%
application-paraphrase
recognition50%
gherkin83%
application-anchor75%
application-paraphrase
recognition75%
github-flow92%92%92%92%42%
application-anchor50%
application-paraphrase75%75%75%75%0%
recognition75%
gutes-deutsch-wolf-schneider83%92%
application-anchor75%
application-paraphrase
recognition75%75%
hexagonal-architecture92%92%
application-anchor
application-paraphrase75%
recognition75%
iec-61508-sil-levels83%92%83%92%33%
application-anchor75%50%0%
application-paraphrase75%75%0%
recognition75%
impact-mapping50%
application-anchor50%
application-paraphrase25%
recognition75%
invest75%
application-anchor
application-paraphrase
recognition25%
iso-2501083%75%
application-anchor75%50%
application-paraphrase75%75%
recognition
jobs-to-be-done92%83%
application-anchor50%
application-paraphrase
recognition75%
lasr83%92%75%58%
application-anchor50%
application-paraphrase75%
recognition50%75%25%50%
linddun92%83%
application-anchor
application-paraphrase75%50%
recognition
llm-evaluations83%
application-anchor75%
application-paraphrase
recognition75%
madr83%75%67%
application-anchor50%
application-paraphrase75%
recognition50%25%75%
mece92%83%
application-anchor75%50%
application-paraphrase
recognition
morphological-box92%67%
application-anchor75%25%
application-paraphrase75%
recognition
moscow92%75%92%75%
application-anchor
application-paraphrase75%25%75%50%
recognition75%
mutation-testing92%58%
application-anchor75%75%
application-paraphrase25%
recognition75%
nelson-rules92%83%
application-anchor75%50%
application-paraphrase
recognition
owasp-top-1092%67%
application-anchor75%75%
application-paraphrase50%
recognition75%
plain-english-strunk-white58%
application-anchor50%
application-paraphrase75%
recognition50%
prd83%92%67%83%58%
application-anchor50%
application-paraphrase
recognition50%75%0%50%25%
problem-space-nvc83%50%
application-anchor75%25%
application-paraphrase75%50%
recognition75%
property-based-testing92%83%92%67%
application-anchor75%75%25%
application-paraphrase75%
recognition75%75%
pyramid-principle92%83%
application-anchor75%75%
application-paraphrase75%
recognition
semantic-versioning75%83%92%
application-anchor50%75%
application-paraphrase75%75%75%
recognition
socratic-method92%67%
application-anchor75%
application-paraphrase75%
recognition25%
sota83%75%
application-anchor75%50%
application-paraphrase75%75%
recognition
spc92%50%
application-anchor75%25%
application-paraphrase75%
recognition50%
stride92%67%
application-anchor50%
application-paraphrase
recognition75%50%
swot83%67%
application-anchor25%
application-paraphrase75%75%
recognition75%
tdd-chicago-school92%92%67%
application-anchor
application-paraphrase50%
recognition75%75%50%
tdd-london-school89%89%71%54%
application-anchor50%
application-paraphrase75%75%
consistency-language75%50%
consistency-variant-175%50%
consistency-variant-275%
consistency-variant-325%50%0%25%
recognition75%75%50%
testing-pyramid83%92%
application-anchor
application-paraphrase75%75%
recognition75%
timtowtdi83%83%
application-anchor75%
application-paraphrase75%
recognition75%75%
todotxt-flavoured-markdown83%83%92%75%
application-anchor75%
application-paraphrase75%
recognition50%50%75%75%
user-story-mapping92%75%
application-anchor
application-paraphrase50%
recognition75%75%
wardley-mapping42%
application-anchor50%
application-paraphrase25%
recognition50%

Control Questions

- + - - + +
ControlClaude SonnetGPT-4oMistral Large
Controlclaude-sonnet-4-20250514devstral-2512gpt-4omistral-large-2512mistral-medium-2508mistral-small-2603
negative-control100%100%75%
sanity-check0%0%0%
negative-control100%75%100%75%100%50%
sanity-check0%0%0%0%0%0%

Failures Detail

-

Claude Sonnet: 2 failures

+

claude-sonnet-4-20250514: 2 failures

feynman-technique/application-paraphrase0%
github-flow/application-paraphrase75%
-

GPT-4o: 13 failures

+

devstral-2512: 17 failures

+
+
atam/application-paraphrase50%
+
bdd-given-when-then/application-anchor50%
+
bdd-given-when-then/recognition0%
+
ears-requirements/application-anchor75%
+
ears-requirements/application-paraphrase75%
+
feynman-technique/application-anchor75%
+
feynman-technique/application-paraphrase75%
+
github-flow/application-paraphrase75%
+
iec-61508-sil-levels/application-anchor75%
+
iec-61508-sil-levels/application-paraphrase75%
+
lasr/recognition50%
+
madr/recognition50%
+
prd/recognition50%
+
property-based-testing/application-anchor75%
+
tdd-chicago-school/recognition75%
+
tdd-london-school/consistency-variant-325%
+
todotxt-flavoured-markdown/recognition50%
+
+

gpt-4o: 13 failures

control-chart-shewhart/application-paraphrase75%
ears-requirements/application-paraphrase75%
@@ -355,7 +393,7 @@

GPT-4o: 13 failures

tdd-london-school/consistency-variant-350%
tdd-london-school/recognition75%
-

Mistral Large: 17 failures

+

mistral-large-2512: 17 failures

adr-according-to-nygard/recognition75%
bdd-given-when-then/application-paraphrase50%
@@ -375,11 +413,209 @@

Mistral Large: 17 failures

semantic-versioning/application-paraphrase75%
todotxt-flavoured-markdown/recognition50%
+

mistral-medium-2508: 77 failures

+
+
adr-according-to-nygard/application-anchor50%
+
adr-according-to-nygard/application-paraphrase25%
+
adr-according-to-nygard/recognition75%
+
arc42/application-anchor75%
+
arc42/application-paraphrase50%
+
arc42/consistency-language0%
+
arc42/consistency-variant-125%
+
arc42/consistency-variant-250%
+
arc42/consistency-variant-350%
+
arc42/recognition75%
+
atam/application-anchor50%
+
atam/application-paraphrase50%
+
atam/recognition75%
+
bdd-given-when-then/application-anchor50%
+
bdd-given-when-then/application-paraphrase25%
+
bdd-given-when-then/recognition25%
+
bem-methodology/application-anchor50%
+
bem-methodology/application-paraphrase25%
+
bem-methodology/recognition50%
+
bluf/application-anchor75%
+
bluf/application-paraphrase75%
+
bluf/recognition75%
+
c4-diagrams/application-anchor50%
+
c4-diagrams/application-paraphrase0%
+
c4-diagrams/recognition25%
+
chain-of-thought/application-anchor50%
+
chain-of-thought/application-paraphrase75%
+
chain-of-thought/recognition50%
+
control-chart-shewhart/application-anchor75%
+
cqrs/application-anchor75%
+
cynefin-framework/application-anchor75%
+
definition-of-done/application-paraphrase75%
+
definition-of-done/recognition75%
+
diataxis-framework/recognition75%
+
docs-as-code/application-paraphrase75%
+
event-driven-architecture/application-paraphrase75%
+
event-driven-architecture/recognition75%
+
fagan-inspection/application-anchor75%
+
fagan-inspection/application-paraphrase75%
+
feynman-technique/application-paraphrase75%
+
fowler-patterns/application-anchor75%
+
gutes-deutsch-wolf-schneider/application-anchor75%
+
gutes-deutsch-wolf-schneider/recognition75%
+
hexagonal-architecture/application-paraphrase75%
+
iec-61508-sil-levels/recognition75%
+
jobs-to-be-done/recognition75%
+
linddun/application-paraphrase75%
+
madr/recognition25%
+
mece/application-anchor75%
+
morphological-box/application-anchor75%
+
moscow/application-paraphrase75%
+
mutation-testing/application-anchor75%
+
nelson-rules/application-anchor75%
+
owasp-top-10/application-anchor75%
+
prd/recognition50%
+
property-based-testing/recognition75%
+
pyramid-principle/application-anchor75%
+
semantic-versioning/application-anchor75%
+
semantic-versioning/application-paraphrase75%
+
socratic-method/application-anchor75%
+
sota/application-anchor75%
+
sota/application-paraphrase75%
+
spc/application-anchor75%
+
stride/recognition75%
+
swot/application-paraphrase75%
+
swot/recognition75%
+
tdd-london-school/application-paraphrase75%
+
tdd-london-school/consistency-language75%
+
tdd-london-school/consistency-variant-175%
+
tdd-london-school/consistency-variant-30%
+
tdd-london-school/recognition75%
+
testing-pyramid/application-paraphrase75%
+
testing-pyramid/recognition75%
+
timtowtdi/application-paraphrase75%
+
timtowtdi/recognition75%
+
todotxt-flavoured-markdown/recognition75%
+
user-story-mapping/recognition75%
+
+

mistral-small-2603: 115 failures

+
+
adr-according-to-nygard/application-anchor75%
+
adr-according-to-nygard/recognition75%
+
atam/application-anchor75%
+
bdd-given-when-then/application-paraphrase75%
+
conventional-commits/application-anchor50%
+
conventional-commits/application-paraphrase75%
+
cqrs/application-anchor75%
+
cqrs/application-paraphrase50%
+
definition-of-done/application-anchor75%
+
definition-of-done/application-paraphrase50%
+
definition-of-done/recognition75%
+
devils-advocate/application-anchor75%
+
devils-advocate/application-paraphrase50%
+
devils-advocate/recognition50%
+
diataxis-framework/application-anchor75%
+
diataxis-framework/application-paraphrase75%
+
diataxis-framework/recognition25%
+
docs-as-code/application-paraphrase50%
+
domain-driven-design/application-paraphrase50%
+
domain-driven-design/recognition50%
+
ears-requirements/application-anchor0%
+
ears-requirements/application-paraphrase50%
+
ears-requirements/recognition0%
+
event-driven-architecture/application-anchor75%
+
event-driven-architecture/application-paraphrase75%
+
event-driven-architecture/recognition75%
+
fagan-inspection/application-anchor25%
+
fagan-inspection/application-paraphrase75%
+
fagan-inspection/recognition75%
+
feynman-technique/application-anchor50%
+
feynman-technique/application-paraphrase75%
+
five-whys/application-anchor75%
+
fowler-patterns/application-anchor75%
+
fowler-patterns/recognition50%
+
gherkin/application-anchor75%
+
gherkin/recognition75%
+
github-flow/application-anchor50%
+
github-flow/application-paraphrase0%
+
github-flow/recognition75%
+
gutes-deutsch-wolf-schneider/recognition75%
+
hexagonal-architecture/recognition75%
+
iec-61508-sil-levels/application-anchor0%
+
iec-61508-sil-levels/application-paraphrase0%
+
impact-mapping/application-anchor50%
+
impact-mapping/application-paraphrase25%
+
impact-mapping/recognition75%
+
invest/recognition25%
+
iso-25010/application-anchor50%
+
iso-25010/application-paraphrase75%
+
jobs-to-be-done/application-anchor50%
+
lasr/application-anchor50%
+
lasr/application-paraphrase75%
+
lasr/recognition50%
+
linddun/application-paraphrase50%
+
llm-evaluations/application-anchor75%
+
llm-evaluations/recognition75%
+
madr/application-anchor50%
+
madr/application-paraphrase75%
+
madr/recognition75%
+
mece/application-anchor50%
+
morphological-box/application-anchor25%
+
morphological-box/application-paraphrase75%
+
moscow/application-paraphrase50%
+
moscow/recognition75%
+
mutation-testing/application-anchor75%
+
mutation-testing/application-paraphrase25%
+
mutation-testing/recognition75%
+
nelson-rules/application-anchor50%
+
owasp-top-10/application-anchor75%
+
owasp-top-10/application-paraphrase50%
+
owasp-top-10/recognition75%
+
plain-english-strunk-white/application-anchor50%
+
plain-english-strunk-white/application-paraphrase75%
+
plain-english-strunk-white/recognition50%
+
prd/application-anchor50%
+
prd/recognition25%
+
problem-space-nvc/application-anchor25%
+
problem-space-nvc/application-paraphrase50%
+
problem-space-nvc/recognition75%
+
property-based-testing/application-anchor25%
+
property-based-testing/recognition75%
+
pyramid-principle/application-anchor75%
+
pyramid-principle/application-paraphrase75%
+
semantic-versioning/application-paraphrase75%
+
socratic-method/application-paraphrase75%
+
socratic-method/recognition25%
+
sota/application-anchor50%
+
sota/application-paraphrase75%
+
spc/application-anchor25%
+
spc/application-paraphrase75%
+
spc/recognition50%
+
stride/application-anchor50%
+
stride/recognition50%
+
swot/application-anchor25%
+
swot/application-paraphrase75%
+
tdd-chicago-school/application-paraphrase50%
+
tdd-chicago-school/recognition50%
+
tdd-london-school/application-anchor50%
+
tdd-london-school/application-paraphrase75%
+
tdd-london-school/consistency-language50%
+
tdd-london-school/consistency-variant-150%
+
tdd-london-school/consistency-variant-275%
+
tdd-london-school/consistency-variant-325%
+
tdd-london-school/recognition50%
+
testing-pyramid/application-paraphrase75%
+
timtowtdi/application-anchor75%
+
timtowtdi/recognition75%
+
todotxt-flavoured-markdown/application-anchor75%
+
todotxt-flavoured-markdown/application-paraphrase75%
+
todotxt-flavoured-markdown/recognition75%
+
user-story-mapping/application-paraphrase50%
+
user-story-mapping/recognition75%
+
wardley-mapping/application-anchor50%
+
wardley-mapping/application-paraphrase25%
+
wardley-mapping/recognition50%
+

Run Metadata

-
Claude Sonnet:
pilot-20260324-174404.json · 81m 2s · 2026-03-24T17:44:04

GPT-4o:
pilot-20260324-192413.json · 15m 38s · 2026-03-24T19:24:13

Mistral Large:
pilot-20260324-190600.json · 16m 58s · 2026-03-24T19:06:00

+
claude-sonnet-4-20250514:
pilot-20260324-174404.json · 81m 2s · 2026-03-24T17:44:04

devstral-2512:
pilot-20260326-073241_devstral-2512.json · 15m 19s · 2026-03-26T07:32:41

gpt-4o:
pilot-20260324-192413.json · 15m 38s · 2026-03-24T19:24:13

mistral-large-2512:
pilot-20260324-190600.json · 16m 58s · 2026-03-24T19:06:00

mistral-medium-2508:
pilot-20260326-070127_mistral-medium-2508.json · 31m 12s · 2026-03-26T07:01:27

mistral-small-2603:
pilot-20260326-074132_mistral-small-2603.json · 18m 9s · 2026-03-26T07:41:32

Generated by evaluations/generate-report.py · Position bias mitigation: 4 permutations per question · Scoring: deterministic MC (no LLM judge)