\n"
@@ -262,7 +281,7 @@ def generate_html(results, output_path):
for m in model_names:
info = results[m]
dur = info["duration"]
- html += f"
"
html += """
diff --git a/evaluations/pilot.py b/evaluations/pilot.py
index c921ad0..bfc640d 100644
--- a/evaluations/pilot.py
+++ b/evaluations/pilot.py
@@ -312,7 +312,16 @@ def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_
print()
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
- out_file = RESULTS_DIR / f"pilot-{ts}.json"
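+ # Include exact model IDs in the filename: the timestamp only has second resolution, so parallel runs of different models started in the same second would otherwise write to the same file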
+ model_ids = []
+ for m in models:
+ if m == "openai": model_ids.append(openai_model)
+ elif m == "mistral": model_ids.append(mistral_model)
+ elif m == "deepseek": model_ids.append(deepseek_model)
+ elif m == "ollama": model_ids.append(f"ollama-{ollama_model}")
+ else: model_ids.append(m)
+ model_suffix = "_".join(model_ids).replace(":", "-").replace("/", "-")
+ out_file = RESULTS_DIR / f"pilot-{ts}_{model_suffix}.json"
all_results = {
"timestamp": datetime.now(timezone.utc).isoformat(),
@@ -449,6 +458,18 @@ def append_and_save(r):
save_results(all_results, out_file)
print(f"\nResults saved to {out_file}")
+ # Also save a stripped summary: same structure, but each entry's per-permutation "results" list (including raw responses) is removed
+ summary_dir = RESULTS_DIR.parent / "summaries"
+ summary_dir.mkdir(parents=True, exist_ok=True)
+ summary = json.loads(json.dumps(all_results)) # deep copy
+ for m_results in summary.get("models", {}).values():
+ for r in m_results:
+ r.pop("results", None)
+ summary_file = summary_dir / out_file.name
+ with open(summary_file, "w", encoding="utf-8") as fh:
+ json.dump(summary, fh, indent=2, ensure_ascii=False)
+ print(f"Summary saved to {summary_file}")
+
# Summary
print("\n=== SUMMARY ===")
print(f"Models: {', '.join(models)}")
diff --git a/evaluations/report.html b/evaluations/report.html
index 0fd9fb2..584a172 100644
--- a/evaluations/report.html
+++ b/evaluations/report.html
@@ -37,7 +37,7 @@
@@ -48,298 +48,336 @@
Semantic Anchor Evaluation Report
Model Summary
-
Claude Sonnet
+
claude-sonnet-4-20250514
99%
191 questions · pilot-20260324-174404.json
-
GPT-4o
+
devstral-2512
+
96%
+
191 questions · pilot-20260326-073241_devstral-2512.json
+
+
+
gpt-4o
98%
191 questions · pilot-20260324-192413.json
-
Mistral Large
+
mistral-large-2512
96%
191 questions · pilot-20260324-190600.json
+
+
mistral-medium-2508
+
85%
+
191 questions · pilot-20260326-070127_mistral-medium-2508.json
+
+
+
mistral-small-2603
+
74%
+
191 questions · pilot-20260326-074132_mistral-small-2603.json
+
Heatmap: Anchor × Model
| Anchor / Question |
- Claude Sonnet |
- GPT-4o |
- Mistral Large |
+ claude-sonnet-4-20250514 |
+ devstral-2512 |
+ gpt-4o |
+ mistral-large-2512 |
+ mistral-medium-2508 |
+ mistral-small-2603 |
-| adr-according-to-nygard | ✓ | ✓ | 92% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | 75% |
-| arc42 | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| consistency-language | ✓ | ✓ | ✓ |
-| consistency-variant-1 | ✓ | ✓ | ✓ |
-| consistency-variant-2 | ✓ | ✓ | ✓ |
-| consistency-variant-3 | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| atam | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| bdd-given-when-then | ✓ | ✓ | 83% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | 50% |
-| recognition | ✓ | ✓ | ✓ |
-| bem-methodology | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| bluf | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| c4-diagrams | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| chain-of-thought | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| clean-architecture | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| control-chart-shewhart | ✓ | 92% | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | 75% | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| conventional-commits | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| cqrs | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| cynefin-framework | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| definition-of-done | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| devils-advocate | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| diataxis-framework | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| docs-as-code | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| domain-driven-design | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| ears-requirements | ✓ | 92% | 83% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | 75% | 75% |
-| recognition | ✓ | ✓ | 75% |
-| event-driven-architecture | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| fagan-inspection | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| feynman-technique | 67% | 67% | 92% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | 0% | 0% | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| five-whys | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| fowler-patterns | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| gherkin | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| github-flow | 92% | 92% | 92% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | 75% | 75% | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| gutes-deutsch-wolf-schneider | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| hexagonal-architecture | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| iec-61508-sil-levels | ✓ | 92% | 83% |
-| application-anchor | ✓ | ✓ | 50% |
-| application-paraphrase | ✓ | 75% | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| impact-mapping | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| invest | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| iso-25010 | ✓ | ✓ | 83% |
-| application-anchor | ✓ | ✓ | 75% |
-| application-paraphrase | ✓ | ✓ | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| jobs-to-be-done | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| lasr | ✓ | 92% | 75% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | 75% | 25% |
-| linddun | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| llm-evaluations | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| madr | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| mece | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| morphological-box | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| moscow | ✓ | 92% | 75% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | 75% | 25% |
-| recognition | ✓ | ✓ | ✓ |
-| mutation-testing | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| nelson-rules | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| owasp-top-10 | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| plain-english-strunk-white | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| prd | ✓ | 92% | 67% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | 75% | 0% |
-| problem-space-nvc | ✓ | ✓ | 83% |
-| application-anchor | ✓ | ✓ | 75% |
-| application-paraphrase | ✓ | ✓ | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| property-based-testing | ✓ | 83% | ✓ |
-| application-anchor | ✓ | 75% | ✓ |
-| application-paraphrase | ✓ | 75% | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| pyramid-principle | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| semantic-versioning | ✓ | ✓ | 75% |
-| application-anchor | ✓ | ✓ | 50% |
-| application-paraphrase | ✓ | ✓ | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| socratic-method | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| sota | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| spc | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| stride | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| swot | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| tdd-chicago-school | ✓ | 92% | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | 75% | ✓ |
-| tdd-london-school | ✓ | 89% | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| consistency-language | ✓ | ✓ | ✓ |
-| consistency-variant-1 | ✓ | ✓ | ✓ |
-| consistency-variant-2 | ✓ | ✓ | ✓ |
-| consistency-variant-3 | ✓ | 50% | ✓ |
-| recognition | ✓ | 75% | ✓ |
-| testing-pyramid | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| timtowtdi | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| todotxt-flavoured-markdown | ✓ | ✓ | 83% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | 50% |
-| user-story-mapping | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| wardley-mapping | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
+| adr-according-to-nygard | ✓ | ✓ | ✓ | 92% | 50% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| recognition | ✓ | ✓ | ✓ | 75% | 75% | 75% |
+| arc42 | ✓ | ✓ | ✓ | ✓ | 46% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| consistency-language | ✓ | ✓ | ✓ | ✓ | 0% | ✓ |
+| consistency-variant-1 | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| consistency-variant-2 | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| consistency-variant-3 | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| atam | ✓ | 83% | ✓ | ✓ | 58% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | 75% |
+| application-paraphrase | ✓ | 50% | ✓ | ✓ | 50% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| bdd-given-when-then | ✓ | 50% | ✓ | 83% | 33% | 92% |
+| application-anchor | ✓ | 50% | ✓ | ✓ | 50% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | 50% | 25% | 75% |
+| recognition | ✓ | 0% | ✓ | ✓ | 25% | ✓ |
+| bem-methodology | ✓ | ✓ | ✓ | ✓ | 42% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| bluf | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| c4-diagrams | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 0% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| chain-of-thought | ✓ | ✓ | ✓ | ✓ | 58% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| clean-architecture | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| control-chart-shewhart | ✓ | ✓ | 92% | ✓ | 92% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | 75% | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| conventional-commits | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| cqrs | ✓ | ✓ | ✓ | ✓ | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| cynefin-framework | ✓ | ✓ | ✓ | ✓ | 92% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| definition-of-done | ✓ | ✓ | ✓ | ✓ | 83% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| devils-advocate | ✓ | ✓ | ✓ | ✓ | ✓ | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| diataxis-framework | ✓ | ✓ | ✓ | ✓ | 92% | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 25% |
+| docs-as-code | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| domain-driven-design | ✓ | ✓ | ✓ | ✓ | ✓ | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| ears-requirements | ✓ | 83% | 92% | 83% | ✓ | 17% |
+| application-anchor | ✓ | 75% | ✓ | ✓ | ✓ | 0% |
+| application-paraphrase | ✓ | 75% | 75% | 75% | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | 75% | ✓ | 0% |
+| event-driven-architecture | ✓ | ✓ | ✓ | ✓ | 83% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| fagan-inspection | ✓ | ✓ | ✓ | ✓ | 83% | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| feynman-technique | 67% | 83% | 67% | 92% | 92% | 75% |
+| application-anchor | ✓ | 75% | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | 0% | 75% | 0% | 75% | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| five-whys | ✓ | ✓ | ✓ | ✓ | ✓ | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| fowler-patterns | ✓ | ✓ | ✓ | ✓ | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| gherkin | ✓ | ✓ | ✓ | ✓ | ✓ | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| github-flow | 92% | 92% | 92% | 92% | ✓ | 42% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | 75% | 75% | 75% | 75% | ✓ | 0% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| gutes-deutsch-wolf-schneider | ✓ | ✓ | ✓ | ✓ | 83% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| hexagonal-architecture | ✓ | ✓ | ✓ | ✓ | 92% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| iec-61508-sil-levels | ✓ | 83% | 92% | 83% | 92% | 33% |
+| application-anchor | ✓ | 75% | ✓ | 50% | ✓ | 0% |
+| application-paraphrase | ✓ | 75% | 75% | ✓ | ✓ | 0% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| impact-mapping | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| invest | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| iso-25010 | ✓ | ✓ | ✓ | 83% | ✓ | 75% |
+| application-anchor | ✓ | ✓ | ✓ | 75% | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | 75% | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| jobs-to-be-done | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| lasr | ✓ | 83% | 92% | 75% | ✓ | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | 50% | 75% | 25% | ✓ | 50% |
+| linddun | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| llm-evaluations | ✓ | ✓ | ✓ | ✓ | ✓ | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| madr | ✓ | 83% | ✓ | ✓ | 75% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | 50% | ✓ | ✓ | 25% | 75% |
+| mece | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| morphological-box | ✓ | ✓ | ✓ | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| moscow | ✓ | ✓ | 92% | 75% | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | 75% | 25% | 75% | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| mutation-testing | ✓ | ✓ | ✓ | ✓ | 92% | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| nelson-rules | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| owasp-top-10 | ✓ | ✓ | ✓ | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| plain-english-strunk-white | ✓ | ✓ | ✓ | ✓ | ✓ | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| prd | ✓ | 83% | 92% | 67% | 83% | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | 50% | 75% | 0% | 50% | 25% |
+| problem-space-nvc | ✓ | ✓ | ✓ | 83% | ✓ | 50% |
+| application-anchor | ✓ | ✓ | ✓ | 75% | ✓ | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | 75% | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| property-based-testing | ✓ | 92% | 83% | ✓ | 92% | 67% |
+| application-anchor | ✓ | 75% | 75% | ✓ | ✓ | 25% |
+| application-paraphrase | ✓ | ✓ | 75% | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| pyramid-principle | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| semantic-versioning | ✓ | ✓ | ✓ | 75% | 83% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | 50% | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | 75% | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| socratic-method | ✓ | ✓ | ✓ | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| sota | ✓ | ✓ | ✓ | ✓ | 83% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| spc | ✓ | ✓ | ✓ | ✓ | 92% | 50% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| stride | ✓ | ✓ | ✓ | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| swot | ✓ | ✓ | ✓ | ✓ | 83% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| tdd-chicago-school | ✓ | 92% | 92% | ✓ | ✓ | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | 75% | 75% | ✓ | ✓ | 50% |
+| tdd-london-school | ✓ | 89% | 89% | ✓ | 71% | 54% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| consistency-language | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| consistency-variant-1 | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| consistency-variant-2 | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| consistency-variant-3 | ✓ | 25% | 50% | ✓ | 0% | 25% |
+| recognition | ✓ | ✓ | 75% | ✓ | 75% | 50% |
+| testing-pyramid | ✓ | ✓ | ✓ | ✓ | 83% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| timtowtdi | ✓ | ✓ | ✓ | ✓ | 83% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| todotxt-flavoured-markdown | ✓ | 83% | ✓ | 83% | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | 50% | ✓ | 50% | 75% | 75% |
+| user-story-mapping | ✓ | ✓ | ✓ | ✓ | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| wardley-mapping | ✓ | ✓ | ✓ | ✓ | ✓ | 42% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
Control Questions
-| Control | Claude Sonnet | GPT-4o | Mistral Large |
+| Control | claude-sonnet-4-20250514 | devstral-2512 | gpt-4o | mistral-large-2512 | mistral-medium-2508 | mistral-small-2603 |
-| negative-control | 100% | 100% | 75% |
-| sanity-check | 0% | 0% | 0% |
+| negative-control | 100% | 75% | 100% | 75% | 100% | 50% |
+| sanity-check | 0% | 0% | 0% | 0% | 0% | 0% |
Failures Detail
-
Claude Sonnet: 2 failures
+
claude-sonnet-4-20250514: 2 failures
feynman-technique/application-paraphrase0%
github-flow/application-paraphrase75%
-
GPT-4o: 13 failures
+
devstral-2512: 17 failures
+
+
atam/application-paraphrase50%
+
bdd-given-when-then/application-anchor50%
+
bdd-given-when-then/recognition0%
+
ears-requirements/application-anchor75%
+
ears-requirements/application-paraphrase75%
+
feynman-technique/application-anchor75%
+
feynman-technique/application-paraphrase75%
+
github-flow/application-paraphrase75%
+
iec-61508-sil-levels/application-anchor75%
+
iec-61508-sil-levels/application-paraphrase75%
+
lasr/recognition50%
+
madr/recognition50%
+
prd/recognition50%
+
property-based-testing/application-anchor75%
+
tdd-chicago-school/recognition75%
+
tdd-london-school/consistency-variant-325%
+
todotxt-flavoured-markdown/recognition50%
+
+
gpt-4o: 13 failures
control-chart-shewhart/application-paraphrase75%
ears-requirements/application-paraphrase75%
@@ -355,7 +393,7 @@
GPT-4o: 13 failures
tdd-london-school/consistency-variant-350%
tdd-london-school/recognition75%
-
Mistral Large: 17 failures
+
mistral-large-2512: 17 failures
adr-according-to-nygard/recognition75%
bdd-given-when-then/application-paraphrase50%
@@ -375,11 +413,209 @@
Mistral Large: 17 failures
semantic-versioning/application-paraphrase75%
todotxt-flavoured-markdown/recognition50%
+
mistral-medium-2508: 77 failures
+
+
adr-according-to-nygard/application-anchor50%
+
adr-according-to-nygard/application-paraphrase25%
+
adr-according-to-nygard/recognition75%
+
arc42/application-anchor75%
+
arc42/application-paraphrase50%
+
arc42/consistency-language0%
+
arc42/consistency-variant-125%
+
arc42/consistency-variant-250%
+
arc42/consistency-variant-350%
+
arc42/recognition75%
+
atam/application-anchor50%
+
atam/application-paraphrase50%
+
atam/recognition75%
+
bdd-given-when-then/application-anchor50%
+
bdd-given-when-then/application-paraphrase25%
+
bdd-given-when-then/recognition25%
+
bem-methodology/application-anchor50%
+
bem-methodology/application-paraphrase25%
+
bem-methodology/recognition50%
+
bluf/application-anchor75%
+
bluf/application-paraphrase75%
+
bluf/recognition75%
+
c4-diagrams/application-anchor50%
+
c4-diagrams/application-paraphrase0%
+
c4-diagrams/recognition25%
+
chain-of-thought/application-anchor50%
+
chain-of-thought/application-paraphrase75%
+
chain-of-thought/recognition50%
+
control-chart-shewhart/application-anchor75%
+
cqrs/application-anchor75%
+
cynefin-framework/application-anchor75%
+
definition-of-done/application-paraphrase75%
+
definition-of-done/recognition75%
+
diataxis-framework/recognition75%
+
docs-as-code/application-paraphrase75%
+
event-driven-architecture/application-paraphrase75%
+
event-driven-architecture/recognition75%
+
fagan-inspection/application-anchor75%
+
fagan-inspection/application-paraphrase75%
+
feynman-technique/application-paraphrase75%
+
fowler-patterns/application-anchor75%
+
gutes-deutsch-wolf-schneider/application-anchor75%
+
gutes-deutsch-wolf-schneider/recognition75%
+
hexagonal-architecture/application-paraphrase75%
+
iec-61508-sil-levels/recognition75%
+
jobs-to-be-done/recognition75%
+
linddun/application-paraphrase75%
+
madr/recognition25%
+
mece/application-anchor75%
+
morphological-box/application-anchor75%
+
moscow/application-paraphrase75%
+
mutation-testing/application-anchor75%
+
nelson-rules/application-anchor75%
+
owasp-top-10/application-anchor75%
+
prd/recognition50%
+
property-based-testing/recognition75%
+
pyramid-principle/application-anchor75%
+
semantic-versioning/application-anchor75%
+
semantic-versioning/application-paraphrase75%
+
socratic-method/application-anchor75%
+
sota/application-anchor75%
+
sota/application-paraphrase75%
+
spc/application-anchor75%
+
stride/recognition75%
+
swot/application-paraphrase75%
+
swot/recognition75%
+
tdd-london-school/application-paraphrase75%
+
tdd-london-school/consistency-language75%
+
tdd-london-school/consistency-variant-175%
+
tdd-london-school/consistency-variant-30%
+
tdd-london-school/recognition75%
+
testing-pyramid/application-paraphrase75%
+
testing-pyramid/recognition75%
+
timtowtdi/application-paraphrase75%
+
timtowtdi/recognition75%
+
todotxt-flavoured-markdown/recognition75%
+
user-story-mapping/recognition75%
+
+
mistral-small-2603: 115 failures
+
+
adr-according-to-nygard/application-anchor75%
+
adr-according-to-nygard/recognition75%
+
atam/application-anchor75%
+
bdd-given-when-then/application-paraphrase75%
+
conventional-commits/application-anchor50%
+
conventional-commits/application-paraphrase75%
+
cqrs/application-anchor75%
+
cqrs/application-paraphrase50%
+
definition-of-done/application-anchor75%
+
definition-of-done/application-paraphrase50%
+
definition-of-done/recognition75%
+
devils-advocate/application-anchor75%
+
devils-advocate/application-paraphrase50%
+
devils-advocate/recognition50%
+
diataxis-framework/application-anchor75%
+
diataxis-framework/application-paraphrase75%
+
diataxis-framework/recognition25%
+
docs-as-code/application-paraphrase50%
+
domain-driven-design/application-paraphrase50%
+
domain-driven-design/recognition50%
+
ears-requirements/application-anchor0%
+
ears-requirements/application-paraphrase50%
+
ears-requirements/recognition0%
+
event-driven-architecture/application-anchor75%
+
event-driven-architecture/application-paraphrase75%
+
event-driven-architecture/recognition75%
+
fagan-inspection/application-anchor25%
+
fagan-inspection/application-paraphrase75%
+
fagan-inspection/recognition75%
+
feynman-technique/application-anchor50%
+
feynman-technique/application-paraphrase75%
+
five-whys/application-anchor75%
+
fowler-patterns/application-anchor75%
+
fowler-patterns/recognition50%
+
gherkin/application-anchor75%
+
gherkin/recognition75%
+
github-flow/application-anchor50%
+
github-flow/application-paraphrase0%
+
github-flow/recognition75%
+
gutes-deutsch-wolf-schneider/recognition75%
+
hexagonal-architecture/recognition75%
+
iec-61508-sil-levels/application-anchor0%
+
iec-61508-sil-levels/application-paraphrase0%
+
impact-mapping/application-anchor50%
+
impact-mapping/application-paraphrase25%
+
impact-mapping/recognition75%
+
invest/recognition25%
+
iso-25010/application-anchor50%
+
iso-25010/application-paraphrase75%
+
jobs-to-be-done/application-anchor50%
+
lasr/application-anchor50%
+
lasr/application-paraphrase75%
+
lasr/recognition50%
+
linddun/application-paraphrase50%
+
llm-evaluations/application-anchor75%
+
llm-evaluations/recognition75%
+
madr/application-anchor50%
+
madr/application-paraphrase75%
+
madr/recognition75%
+
mece/application-anchor50%
+
morphological-box/application-anchor25%
+
morphological-box/application-paraphrase75%
+
moscow/application-paraphrase50%
+
moscow/recognition75%
+
mutation-testing/application-anchor75%
+
mutation-testing/application-paraphrase25%
+
mutation-testing/recognition75%
+
nelson-rules/application-anchor50%
+
owasp-top-10/application-anchor75%
+
owasp-top-10/application-paraphrase50%
+
owasp-top-10/recognition75%
+
plain-english-strunk-white/application-anchor50%
+
plain-english-strunk-white/application-paraphrase75%
+
plain-english-strunk-white/recognition50%
+
prd/application-anchor50%
+
prd/recognition25%
+
problem-space-nvc/application-anchor25%
+
problem-space-nvc/application-paraphrase50%
+
problem-space-nvc/recognition75%
+
property-based-testing/application-anchor25%
+
property-based-testing/recognition75%
+
pyramid-principle/application-anchor75%
+
pyramid-principle/application-paraphrase75%
+
semantic-versioning/application-paraphrase75%
+
socratic-method/application-paraphrase75%
+
socratic-method/recognition25%
+
sota/application-anchor50%
+
sota/application-paraphrase75%
+
spc/application-anchor25%
+
spc/application-paraphrase75%
+
spc/recognition50%
+
stride/application-anchor50%
+
stride/recognition50%
+
swot/application-anchor25%
+
swot/application-paraphrase75%
+
tdd-chicago-school/application-paraphrase50%
+
tdd-chicago-school/recognition50%
+
tdd-london-school/application-anchor50%
+
tdd-london-school/application-paraphrase75%
+
tdd-london-school/consistency-language50%
+
tdd-london-school/consistency-variant-150%
+
tdd-london-school/consistency-variant-275%
+
tdd-london-school/consistency-variant-325%
+
tdd-london-school/recognition50%
+
testing-pyramid/application-paraphrase75%
+
timtowtdi/application-anchor75%
+
timtowtdi/recognition75%
+
todotxt-flavoured-markdown/application-anchor75%
+
todotxt-flavoured-markdown/application-paraphrase75%
+
todotxt-flavoured-markdown/recognition75%
+
user-story-mapping/application-paraphrase50%
+
user-story-mapping/recognition75%
+
wardley-mapping/application-anchor50%
+
wardley-mapping/application-paraphrase25%
+
wardley-mapping/recognition50%
+
diff --git a/evaluations/results/pilot-20260324-174404.json b/evaluations/results/pilot-20260324-174404.json
deleted file mode 100644
index 110f33a..0000000
--- a/evaluations/results/pilot-20260324-174404.json
+++ /dev/null
@@ -1,10442 +0,0 @@
-{
- "timestamp": "2026-03-24T17:44:04.891380+00:00",
- "config": {
- "models": [
- "claude"
- ],
- "openai_model": null,
- "mistral_model": null,
- "deepseek_model": null,
- "ollama_model": null,
- "ollama_url": null,
- "no_think": null,
- "temperature": 0.0
- },
- "models": {
- "claude": [
- {
- "label": "adr-according-to-nygard/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "adr-according-to-nygard/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "adr-according-to-nygard/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "arc42/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-1",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-2",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-3",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-language",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "atam/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "atam/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "atam/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bem-methodology/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bem-methodology/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bem-methodology/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bluf/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bluf/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bluf/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "c4-diagrams/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "c4-diagrams/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "c4-diagrams/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "chain-of-thought/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "chain-of-thought/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "chain-of-thought/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cynefin-framework/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cynefin-framework/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cynefin-framework/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "devils-advocate/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "devils-advocate/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "devils-advocate/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "diataxis-framework/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "diataxis-framework/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "diataxis-framework/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "docs-as-code/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "docs-as-code/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "docs-as-code/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "domain-driven-design/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "domain-driven-design/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "domain-driven-design/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "ears-requirements/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "ears-requirements/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "ears-requirements/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "event-driven-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "event-driven-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "event-driven-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fagan-inspection/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fagan-inspection/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fagan-inspection/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "feynman-technique/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "feynman-technique/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "feynman-technique/application-paraphrase",
- "score": 0.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "C",
- "correct": false,
- "raw_response": "C"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- }
- ]
- },
- {
- "label": "five-whys/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "five-whys/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "five-whys/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fowler-patterns/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fowler-patterns/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fowler-patterns/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gherkin/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "gherkin/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gherkin/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "github-flow/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "github-flow/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "github-flow/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "impact-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "impact-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "impact-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iso-25010/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iso-25010/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iso-25010/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "lasr/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "lasr/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "lasr/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "linddun/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "linddun/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "linddun/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "llm-evaluations/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "llm-evaluations/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "llm-evaluations/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "madr/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "madr/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "madr/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mece/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mece/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mece/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "morphological-box/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "morphological-box/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "morphological-box/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mutation-testing/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mutation-testing/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mutation-testing/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "negative-control/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- }
- ]
- },
- {
- "label": "nelson-rules/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "nelson-rules/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "nelson-rules/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "owasp-top-10/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "owasp-top-10/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "owasp-top-10/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "prd/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "prd/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "prd/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "problem-space-nvc/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "problem-space-nvc/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "problem-space-nvc/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "property-based-testing/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "property-based-testing/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "property-based-testing/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "pyramid-principle/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "pyramid-principle/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "pyramid-principle/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sanity-check/recognition",
- "score": 0.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "X",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "X",
- "answer": null,
- "correct": false,
- "raw_response": "None of the options provided match the famous answer from"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "X",
- "answer": null,
- "correct": false,
- "raw_response": "None of the options provided match the correct answer from"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "X",
- "answer": null,
- "correct": false,
- "raw_response": "None of the options provided match the famous answer from"
- }
- ]
- },
- {
- "label": "semantic-versioning/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "semantic-versioning/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "semantic-versioning/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "socratic-method/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "socratic-method/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "socratic-method/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "spc/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "spc/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "spc/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "swot/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "swot/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "swot/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-1",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-2",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-3",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-language",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "testing-pyramid/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "testing-pyramid/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "testing-pyramid/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "timtowtdi/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "timtowtdi/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "timtowtdi/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "user-story-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "user-story-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "user-story-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "wardley-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "wardley-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "wardley-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- }
- ]
- },
- "duration_seconds": 4862.1
-}
\ No newline at end of file
diff --git a/evaluations/results/pilot-20260324-190600.json b/evaluations/results/pilot-20260324-190600.json
deleted file mode 100644
index 613af48..0000000
--- a/evaluations/results/pilot-20260324-190600.json
+++ /dev/null
@@ -1,10442 +0,0 @@
-{
- "timestamp": "2026-03-24T19:06:00.394684+00:00",
- "config": {
- "models": [
- "mistral"
- ],
- "openai_model": null,
- "mistral_model": "mistral-large-latest",
- "deepseek_model": null,
- "ollama_model": null,
- "ollama_url": null,
- "no_think": null,
- "temperature": 0.0
- },
- "models": {
- "mistral": [
- {
- "label": "adr-according-to-nygard/recognition",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "adr-according-to-nygard/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "adr-according-to-nygard/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "arc42/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-1",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-2",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-3",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-language",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "atam/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "atam/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "atam/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/application-paraphrase",
- "score": 0.5,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": null,
- "correct": false,
- "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": null,
- "correct": false,
- "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bem-methodology/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bem-methodology/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bem-methodology/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bluf/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bluf/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bluf/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "c4-diagrams/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "c4-diagrams/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "c4-diagrams/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "chain-of-thought/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "chain-of-thought/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "chain-of-thought/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cynefin-framework/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cynefin-framework/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cynefin-framework/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "devils-advocate/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "devils-advocate/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "devils-advocate/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "diataxis-framework/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "diataxis-framework/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "diataxis-framework/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "docs-as-code/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "docs-as-code/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "docs-as-code/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "domain-driven-design/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "domain-driven-design/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "domain-driven-design/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "ears-requirements/recognition",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "C",
- "correct": false,
- "raw_response": "C)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "ears-requirements/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "ears-requirements/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "event-driven-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "event-driven-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "event-driven-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fagan-inspection/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fagan-inspection/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fagan-inspection/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "feynman-technique/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "feynman-technique/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "feynman-technique/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "five-whys/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "five-whys/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "five-whys/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fowler-patterns/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fowler-patterns/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fowler-patterns/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gherkin/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "gherkin/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gherkin/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "github-flow/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "github-flow/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "github-flow/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/application-anchor",
- "score": 0.5,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "impact-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "impact-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "impact-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iso-25010/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iso-25010/application-anchor",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iso-25010/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "lasr/recognition",
- "score": 0.25,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "C",
- "correct": false,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- }
- ]
- },
- {
- "label": "lasr/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "lasr/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "linddun/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "linddun/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "linddun/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "llm-evaluations/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "llm-evaluations/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "llm-evaluations/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "madr/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "madr/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "madr/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mece/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mece/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mece/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "morphological-box/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "morphological-box/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "morphological-box/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/application-paraphrase",
- "score": 0.25,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "mutation-testing/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mutation-testing/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mutation-testing/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "negative-control/recognition",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "D",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- }
- ]
- },
- {
- "label": "nelson-rules/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "nelson-rules/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "nelson-rules/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "owasp-top-10/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "owasp-top-10/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "owasp-top-10/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "prd/recognition",
- "score": 0.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "C",
- "correct": false,
- "raw_response": "C"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- }
- ]
- },
- {
- "label": "prd/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "prd/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "problem-space-nvc/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "problem-space-nvc/application-anchor",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "C",
- "correct": false,
- "raw_response": "C"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "problem-space-nvc/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "C",
- "correct": false,
- "raw_response": "C"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "property-based-testing/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "property-based-testing/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "property-based-testing/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "pyramid-principle/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "pyramid-principle/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "pyramid-principle/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sanity-check/recognition",
- "score": 0.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "X",
- "answer": null,
- "correct": false,
- "raw_response": "None of the above options is correct, but the"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "X",
- "answer": null,
- "correct": false,
- "raw_response": "None of the options provided is correct, but the"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "X",
- "answer": null,
- "correct": false,
- "raw_response": "None of the options provided is correct, but the"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "X",
- "answer": "C",
- "correct": false,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "semantic-versioning/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "semantic-versioning/application-anchor",
- "score": 0.5,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "C",
- "correct": false,
- "raw_response": "C"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- }
- ]
- },
- {
- "label": "semantic-versioning/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- }
- ]
- },
- {
- "label": "socratic-method/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "socratic-method/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "socratic-method/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "spc/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "spc/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "spc/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "None of the options perfectly describe the **STRIDE"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "swot/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "swot/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) SWOT Analysis\nB) Value Chain"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "swot/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-1",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-2",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-3",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-language",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "testing-pyramid/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "testing-pyramid/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "testing-pyramid/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "timtowtdi/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "timtowtdi/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "timtowtdi/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/recognition",
- "score": 0.5,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "user-story-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "user-story-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "user-story-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "wardley-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "wardley-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "wardley-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- }
- ]
- },
- "duration_seconds": 1018.4
-}
\ No newline at end of file
diff --git a/evaluations/results/pilot-20260324-192413.json b/evaluations/results/pilot-20260324-192413.json
deleted file mode 100644
index 8c5f85e..0000000
--- a/evaluations/results/pilot-20260324-192413.json
+++ /dev/null
@@ -1,10442 +0,0 @@
-{
- "timestamp": "2026-03-24T19:24:13.551875+00:00",
- "config": {
- "models": [
- "openai"
- ],
- "openai_model": "gpt-4o",
- "mistral_model": null,
- "deepseek_model": null,
- "ollama_model": null,
- "ollama_url": null,
- "no_think": null,
- "temperature": 0.0
- },
- "models": {
- "openai": [
- {
- "label": "adr-according-to-nygard/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "adr-according-to-nygard/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "adr-according-to-nygard/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "arc42/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) A 12-section template for standardized software"
- }
- ]
- },
- {
- "label": "arc42/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) External interfaces in Section 3 (Context"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-1",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Gernot Starke"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Gernot Starke"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Gernot Starke"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Gernot Starke"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-2",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Gernot Starke"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Gernot Starke"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Gernot Starke"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Gernot Starke"
- }
- ]
- },
- {
- "label": "arc42/consistency-variant-3",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Gernot Starke"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Gernot Starke"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Gernot Starke"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Gernot Starke"
- }
- ]
- },
- {
- "label": "arc42/consistency-language",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "atam/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "atam/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "atam/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Build a utility tree to prioritize quality scenarios"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Build a utility tree to prioritize quality scenarios"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Build a utility tree to prioritize quality scenarios"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bdd-given-when-then/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Organize discovery workshops with the three amigos"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bem-methodology/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bem-methodology/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Use structured class names like `.navbar`,"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Use structured class names like `.navbar`,"
- }
- ]
- },
- {
- "label": "bem-methodology/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Use structured class names like `.navbar`,"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Use structured class names like `.navbar`,"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Use structured class names like `.navbar`,"
- }
- ]
- },
- {
- "label": "bluf/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bluf/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "bluf/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "c4-diagrams/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Four levels of abstraction; : system in"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Four levels of abstraction; : system in"
- }
- ]
- },
- {
- "label": "c4-diagrams/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "c4-diagrams/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "chain-of-thought/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "chain-of-thought/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "chain-of-thought/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Define payment processing use cases in the core"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "clean-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Measured value plotted over time; process"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "control-chart-shewhart/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "conventional-commits/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cqrs/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Create separate optimized data models: a normalized"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Create separate optimized data models: a normalized"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Create separate optimized data models: a normalized"
- }
- ]
- },
- {
- "label": "cynefin-framework/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Five domains; : best practices apply,"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Five domains; : best practices apply,"
- }
- ]
- },
- {
- "label": "cynefin-framework/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "cynefin-framework/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "definition-of-done/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Collaboratively create a single, team-wide"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "devils-advocate/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Present opposing viewpoints even if not personally held"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "devils-advocate/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "devils-advocate/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Systematically argue against your own design by"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "diataxis-framework/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Four documentation types; : learning-oriented,"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "diataxis-framework/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Develop four distinct documentation sections: beginner tutorials"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "diataxis-framework/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Develop four distinct documentation sections: beginner tutorials"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Develop four distinct documentation sections: beginner tutorials"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "docs-as-code/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "docs-as-code/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "docs-as-code/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "domain-driven-design/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "domain-driven-design/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "domain-driven-design/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Establish a ubiquitous language by working closely with"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "ears-requirements/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "ears-requirements/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Structure requirements using specific templates: 'The"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Structure requirements using specific templates: 'The"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "ears-requirements/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Structure requirements using specific templates: 'The"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B) Create a comprehensive requirements specification document with functional"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Structure requirements using specific templates: 'The"
- }
- ]
- },
- {
- "label": "event-driven-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "event-driven-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "event-driven-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "fagan-inspection/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fagan-inspection/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fagan-inspection/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "feynman-technique/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "feynman-technique/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "feynman-technique/application-paraphrase",
- "score": 0.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "C",
- "correct": false,
- "raw_response": "C)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- }
- ]
- },
- {
- "label": "five-whys/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "five-whys/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "five-whys/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fowler-patterns/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "fowler-patterns/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "fowler-patterns/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Use a Domain Model pattern for complex business"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Use a Domain Model pattern for complex business"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Use a Domain Model pattern for complex business"
- }
- ]
- },
- {
- "label": "gherkin/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "gherkin/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gherkin/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "github-flow/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Workflow steps"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Workflow steps"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Workflow steps"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "github-flow/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "github-flow/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "hexagonal-architecture/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iec-61508-sil-levels/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- }
- ]
- },
- {
- "label": "impact-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Goal → Actors → Impacts → Deliver"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "impact-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Map the retention goal to key actors ("
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Map the retention goal to key actors ("
- }
- ]
- },
- {
- "label": "impact-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Map the retention goal to key actors ("
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "invest/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Split this into multiple smaller stories with specific"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Split this into multiple smaller stories with specific"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "iso-25010/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "iso-25010/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "iso-25010/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Interview users about the specific circumstances that led"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "jobs-to-be-done/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Interview users about the specific circumstances that led"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Interview users about the specific circumstances that led"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Interview users about the specific circumstances that led"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Interview users about the specific circumstances that led"
- }
- ]
- },
- {
- "label": "lasr/recognition",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- }
- ]
- },
- {
- "label": "lasr/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "lasr/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "linddun/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "linddun/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "linddun/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Systematically analyze the system against seven privacy"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Systematically analyze the system against seven privacy"
- }
- ]
- },
- {
- "label": "llm-evaluations/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "llm-evaluations/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "llm-evaluations/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "madr/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "madr/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Document the decision with sections for context,"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Document the decision with sections for context,"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "madr/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Document the decision with sections for context,"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "mece/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Structuring categories so they do not overlap"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mece/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Organize by business capability: User Service"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Organize by business capability: User Service"
- }
- ]
- },
- {
- "label": "mece/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Organize by business capability: User Service"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Organize by business capability: User Service"
- }
- ]
- },
- {
- "label": "morphological-box/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Break complex problem into independent parameters/dim"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Break complex problem into independent parameters/dim"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Break complex problem into independent parameters/dim"
- }
- ]
- },
- {
- "label": "morphological-box/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "morphological-box/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "moscow/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "C",
- "correct": false,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mutation-testing/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mutation-testing/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "mutation-testing/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "negative-control/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- }
- ]
- },
- {
- "label": "nelson-rules/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "nelson-rules/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "nelson-rules/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "owasp-top-10/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "owasp-top-10/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "owasp-top-10/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "plain-english-strunk-white/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "prd/recognition",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- }
- ]
- },
- {
- "label": "prd/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "prd/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Write a comprehensive document that defines the problem"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Write a comprehensive document that defines the problem"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "problem-space-nvc/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Concrete, objective facts without evaluation or judgment"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "problem-space-nvc/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "problem-space-nvc/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "property-based-testing/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Invariants that should always hold; automatic"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "property-based-testing/application-anchor",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Define mathematical invariants like 'interest calculations"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "A",
- "correct": false,
- "raw_response": "A"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "property-based-testing/application-paraphrase",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "D",
- "correct": false,
- "raw_response": "D) Write comprehensive unit tests covering typical financial scenarios"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Define mathematical invariants like 'interest calculations"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "pyramid-principle/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "pyramid-principle/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "pyramid-principle/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sanity-check/recognition",
- "score": 0.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "X",
- "answer": "A",
- "correct": false,
- "raw_response": "None of the options provided are correct. The Answer"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "X",
- "answer": "A",
- "correct": false,
- "raw_response": "None of the options provided correspond to the Answer to"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "X",
- "answer": "A",
- "correct": false,
- "raw_response": "None of the options provided correspond to the Answer to"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "X",
- "answer": "A",
- "correct": false,
- "raw_response": "None of the options provided correspond to the Answer to"
- }
- ]
- },
- {
- "label": "semantic-versioning/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "semantic-versioning/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) 3.0.0 - because"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "semantic-versioning/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "socratic-method/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "socratic-method/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "socratic-method/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "sota/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "spc/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "spc/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "spc/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "None of the options provided accurately describe the STRIDE"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "stride/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "swot/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "swot/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Analyze internal strengths and weaknesses of your current"
- }
- ]
- },
- {
- "label": "swot/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Analyze internal strengths and weaknesses of your current"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/recognition",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "D",
- "correct": false,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "tdd-chicago-school/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Begin with tests for the core pricing calculations"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "tdd-london-school/recognition",
- "score": 0.75,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "D",
- "correct": false,
- "raw_response": "D"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Write a test that mocks PaymentGateway and"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-1",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Steve Freeman"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Steve Freeman"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Steve Freeman"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Steve Freeman"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-2",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Steve Freeman"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Steve Freeman"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Steve Freeman"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Steve Freeman"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-variant-3",
- "score": 0.5,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "C",
- "correct": false,
- "raw_response": "C) Dan North"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "B",
- "correct": false,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Steve Freeman"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C) Steve Freeman"
- }
- ]
- },
- {
- "label": "tdd-london-school/consistency-language",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D) Steve Freeman"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "testing-pyramid/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Three layers; more unit tests, fewer"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A) Three layers; more unit tests, fewer"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "testing-pyramid/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "testing-pyramid/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "timtowtdi/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- }
- ]
- },
- {
- "label": "timtowtdi/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "timtowtdi/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "todotxt-flavoured-markdown/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "user-story-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "user-story-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "user-story-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A)"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "wardley-mapping/recognition",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C)"
- }
- ]
- },
- {
- "label": "wardley-mapping/application-anchor",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B)"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- },
- {
- "label": "wardley-mapping/application-paraphrase",
- "score": 1.0,
- "results": [
- {
- "permutation": [
- "A",
- "B",
- "C",
- "D"
- ],
- "expected": "B",
- "answer": "B",
- "correct": true,
- "raw_response": "B) Map the payment processing component's position on"
- },
- {
- "permutation": [
- "B",
- "C",
- "D",
- "A"
- ],
- "expected": "A",
- "answer": "A",
- "correct": true,
- "raw_response": "A"
- },
- {
- "permutation": [
- "C",
- "D",
- "A",
- "B"
- ],
- "expected": "D",
- "answer": "D",
- "correct": true,
- "raw_response": "D)"
- },
- {
- "permutation": [
- "D",
- "A",
- "B",
- "C"
- ],
- "expected": "C",
- "answer": "C",
- "correct": true,
- "raw_response": "C"
- }
- ]
- }
- ]
- },
- "duration_seconds": 938.4
-}
\ No newline at end of file
diff --git a/evaluations/summaries/pilot-20260324-174404.json b/evaluations/summaries/pilot-20260324-174404.json
new file mode 100644
index 0000000..c66b5d9
--- /dev/null
+++ b/evaluations/summaries/pilot-20260324-174404.json
@@ -0,0 +1,792 @@
+{
+ "timestamp": "2026-03-24T17:44:04.891380+00:00",
+ "config": {
+ "models": [
+ "claude"
+ ],
+ "openai_model": null,
+ "mistral_model": null,
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "claude": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "atam/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.0
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "invest/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "madr/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "prd/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "spc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "stride/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "swot/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 1.0
+ }
+ ]
+ },
+ "duration_seconds": 4862.1
+}
\ No newline at end of file
diff --git a/evaluations/summaries/pilot-20260324-190600.json b/evaluations/summaries/pilot-20260324-190600.json
new file mode 100644
index 0000000..075b200
--- /dev/null
+++ b/evaluations/summaries/pilot-20260324-190600.json
@@ -0,0 +1,792 @@
+{
+ "timestamp": "2026-03-24T19:06:00.394684+00:00",
+ "config": {
+ "models": [
+ "mistral"
+ ],
+ "openai_model": null,
+ "mistral_model": "mistral-large-2512",
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "mistral": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "atam/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "invest/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 0.25
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "madr/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 0.25
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "prd/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "spc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "stride/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "swot/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 1.0
+ }
+ ]
+ },
+ "duration_seconds": 1018.4
+}
\ No newline at end of file
diff --git a/evaluations/summaries/pilot-20260324-192413.json b/evaluations/summaries/pilot-20260324-192413.json
new file mode 100644
index 0000000..5312994
--- /dev/null
+++ b/evaluations/summaries/pilot-20260324-192413.json
@@ -0,0 +1,792 @@
+{
+ "timestamp": "2026-03-24T19:24:13.551875+00:00",
+ "config": {
+ "models": [
+ "openai"
+ ],
+ "openai_model": "gpt-4o",
+ "mistral_model": null,
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "openai": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "atam/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.0
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "invest/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "madr/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "prd/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "spc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "stride/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "swot/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 0.5
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 1.0
+ }
+ ]
+ },
+ "duration_seconds": 938.4
+}
\ No newline at end of file
diff --git a/evaluations/summaries/pilot-20260326-070127_mistral-medium-2508.json b/evaluations/summaries/pilot-20260326-070127_mistral-medium-2508.json
new file mode 100644
index 0000000..3b26758
--- /dev/null
+++ b/evaluations/summaries/pilot-20260326-070127_mistral-medium-2508.json
@@ -0,0 +1,792 @@
+{
+ "timestamp": "2026-03-26T07:01:27.544539+00:00",
+ "config": {
+ "models": [
+ "mistral"
+ ],
+ "openai_model": null,
+ "mistral_model": "mistral-medium-2508",
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "mistral": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 0.25
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 0.25
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 0.5
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 0.5
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 0.0
+ },
+ {
+ "label": "atam/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 0.25
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 0.25
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 0.25
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 0.25
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 0.0
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "invest/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "madr/recognition",
+ "score": 0.25
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "prd/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "spc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "stride/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "swot/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 0.0
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 0.75
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 1.0
+ }
+ ]
+ },
+ "duration_seconds": 1872.8
+}
\ No newline at end of file
diff --git a/evaluations/summaries/pilot-20260326-073241_devstral-2512.json b/evaluations/summaries/pilot-20260326-073241_devstral-2512.json
new file mode 100644
index 0000000..e22ba86
--- /dev/null
+++ b/evaluations/summaries/pilot-20260326-073241_devstral-2512.json
@@ -0,0 +1,792 @@
+{
+ "timestamp": "2026-03-26T07:32:41.051735+00:00",
+ "config": {
+ "models": [
+ "mistral"
+ ],
+ "openai_model": null,
+ "mistral_model": "devstral-2512",
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "mistral": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "atam/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "invest/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "madr/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "prd/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "spc/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "stride/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "swot/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 0.25
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 1.0
+ }
+ ]
+ },
+ "duration_seconds": 919.6
+}
\ No newline at end of file
diff --git a/evaluations/summaries/pilot-20260326-074132_mistral-small-2603.json b/evaluations/summaries/pilot-20260326-074132_mistral-small-2603.json
new file mode 100644
index 0000000..d7d851e
--- /dev/null
+++ b/evaluations/summaries/pilot-20260326-074132_mistral-small-2603.json
@@ -0,0 +1,792 @@
+{
+ "timestamp": "2026-03-26T07:41:32.011654+00:00",
+ "config": {
+ "models": [
+ "mistral"
+ ],
+ "openai_model": null,
+ "mistral_model": "mistral-small-2603",
+ "deepseek_model": null,
+ "ollama_model": null,
+ "ollama_url": null,
+ "no_think": null,
+ "temperature": 0.0
+ },
+ "models": {
+ "mistral": [
+ {
+ "label": "adr-according-to-nygard/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "adr-according-to-nygard/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "adr-according-to-nygard/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-1",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-2",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-variant-3",
+ "score": 1.0
+ },
+ {
+ "label": "arc42/consistency-language",
+ "score": 1.0
+ },
+ {
+ "label": "atam/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "atam/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "atam/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bdd-given-when-then/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "bem-methodology/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bem-methodology/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "bluf/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "c4-diagrams/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "chain-of-thought/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "clean-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "control-chart-shewhart/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "conventional-commits/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "conventional-commits/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "cqrs/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cqrs/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "cqrs/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "cynefin-framework/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "cynefin-framework/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "definition-of-done/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "definition-of-done/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "definition-of-done/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "devils-advocate/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "devils-advocate/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "devils-advocate/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "diataxis-framework/recognition",
+ "score": 0.25
+ },
+ {
+ "label": "diataxis-framework/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "diataxis-framework/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "docs-as-code/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "docs-as-code/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "domain-driven-design/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "domain-driven-design/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "domain-driven-design/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "ears-requirements/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "ears-requirements/application-anchor",
+ "score": 0.0
+ },
+ {
+ "label": "ears-requirements/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "event-driven-architecture/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "event-driven-architecture/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "event-driven-architecture/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "fagan-inspection/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "fagan-inspection/application-anchor",
+ "score": 0.25
+ },
+ {
+ "label": "fagan-inspection/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "feynman-technique/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "feynman-technique/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "feynman-technique/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "five-whys/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "five-whys/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "five-whys/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "fowler-patterns/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "fowler-patterns/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "fowler-patterns/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "gherkin/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "gherkin/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "gherkin/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "github-flow/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "github-flow/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "github-flow/application-paraphrase",
+ "score": 0.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "hexagonal-architecture/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "hexagonal-architecture/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-anchor",
+ "score": 0.0
+ },
+ {
+ "label": "iec-61508-sil-levels/application-paraphrase",
+ "score": 0.0
+ },
+ {
+ "label": "impact-mapping/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "impact-mapping/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "impact-mapping/application-paraphrase",
+ "score": 0.25
+ },
+ {
+ "label": "invest/recognition",
+ "score": 0.25
+ },
+ {
+ "label": "invest/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "invest/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "iso-25010/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "iso-25010/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "jobs-to-be-done/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "jobs-to-be-done/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "jobs-to-be-done/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "lasr/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "lasr/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "lasr/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "linddun/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "linddun/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "llm-evaluations/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "llm-evaluations/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "llm-evaluations/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "madr/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "madr/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "madr/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "mece/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "mece/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "mece/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "morphological-box/application-anchor",
+ "score": 0.25
+ },
+ {
+ "label": "morphological-box/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "moscow/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "moscow/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "moscow/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "mutation-testing/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "mutation-testing/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "mutation-testing/application-paraphrase",
+ "score": 0.25
+ },
+ {
+ "label": "negative-control/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "nelson-rules/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "nelson-rules/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "nelson-rules/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "owasp-top-10/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "owasp-top-10/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "owasp-top-10/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "plain-english-strunk-white/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "plain-english-strunk-white/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "plain-english-strunk-white/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "prd/recognition",
+ "score": 0.25
+ },
+ {
+ "label": "prd/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "prd/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "problem-space-nvc/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "problem-space-nvc/application-anchor",
+ "score": 0.25
+ },
+ {
+ "label": "problem-space-nvc/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "property-based-testing/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "property-based-testing/application-anchor",
+ "score": 0.25
+ },
+ {
+ "label": "property-based-testing/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "pyramid-principle/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "pyramid-principle/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "sanity-check/recognition",
+ "score": 0.0
+ },
+ {
+ "label": "semantic-versioning/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "semantic-versioning/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "socratic-method/recognition",
+ "score": 0.25
+ },
+ {
+ "label": "socratic-method/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "socratic-method/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "sota/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "sota/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "sota/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "spc/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "spc/application-anchor",
+ "score": 0.25
+ },
+ {
+ "label": "spc/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "stride/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "stride/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "stride/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "swot/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "swot/application-anchor",
+ "score": 0.25
+ },
+ {
+ "label": "swot/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-chicago-school/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "tdd-chicago-school/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "tdd-chicago-school/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "tdd-london-school/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "tdd-london-school/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "tdd-london-school/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-1",
+ "score": 0.5
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-2",
+ "score": 0.75
+ },
+ {
+ "label": "tdd-london-school/consistency-variant-3",
+ "score": 0.25
+ },
+ {
+ "label": "tdd-london-school/consistency-language",
+ "score": 0.5
+ },
+ {
+ "label": "testing-pyramid/recognition",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "testing-pyramid/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "timtowtdi/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "timtowtdi/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "timtowtdi/application-paraphrase",
+ "score": 1.0
+ },
+ {
+ "label": "todotxt-flavoured-markdown/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-anchor",
+ "score": 0.75
+ },
+ {
+ "label": "todotxt-flavoured-markdown/application-paraphrase",
+ "score": 0.75
+ },
+ {
+ "label": "user-story-mapping/recognition",
+ "score": 0.75
+ },
+ {
+ "label": "user-story-mapping/application-anchor",
+ "score": 1.0
+ },
+ {
+ "label": "user-story-mapping/application-paraphrase",
+ "score": 0.5
+ },
+ {
+ "label": "wardley-mapping/recognition",
+ "score": 0.5
+ },
+ {
+ "label": "wardley-mapping/application-anchor",
+ "score": 0.5
+ },
+ {
+ "label": "wardley-mapping/application-paraphrase",
+ "score": 0.25
+ }
+ ]
+ },
+ "duration_seconds": 1089.2
+}
\ No newline at end of file
diff --git a/website/public/evaluation-report.html b/website/public/evaluation-report.html
index 0fd9fb2..584a172 100644
--- a/website/public/evaluation-report.html
+++ b/website/public/evaluation-report.html
@@ -37,7 +37,7 @@
Semantic Anchor Evaluation Report
-
Multiple-choice recognition test across 3 LLMs — 191 questions, 61 anchors
+
Multiple-choice recognition test across 6 LLMs — 191 questions, 61 anchors
@@ -48,298 +48,336 @@
Semantic Anchor Evaluation Report
Model Summary
-
Claude Sonnet
+
claude-sonnet-4-20250514
99%
191 questions · pilot-20260324-174404.json
-
GPT-4o
+
devstral-2512
+
96%
+
191 questions · pilot-20260326-073241_devstral-2512.json
+
+
+
gpt-4o
98%
191 questions · pilot-20260324-192413.json
-
Mistral Large
+
mistral-large-2512
96%
191 questions · pilot-20260324-190600.json
+
+
mistral-medium-2508
+
85%
+
191 questions · pilot-20260326-070127_mistral-medium-2508.json
+
+
+
mistral-small-2603
+
74%
+
191 questions · pilot-20260326-074132_mistral-small-2603.json
+
Heatmap: Anchor × Model
| Anchor / Question |
- Claude Sonnet |
- GPT-4o |
- Mistral Large |
+ claude-sonnet-4-20250514 |
+ devstral-2512 |
+ gpt-4o |
+ mistral-large-2512 |
+ mistral-medium-2508 |
+ mistral-small-2603 |
-| adr-according-to-nygard | ✓ | ✓ | 92% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | 75% |
-| arc42 | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| consistency-language | ✓ | ✓ | ✓ |
-| consistency-variant-1 | ✓ | ✓ | ✓ |
-| consistency-variant-2 | ✓ | ✓ | ✓ |
-| consistency-variant-3 | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| atam | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| bdd-given-when-then | ✓ | ✓ | 83% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | 50% |
-| recognition | ✓ | ✓ | ✓ |
-| bem-methodology | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| bluf | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| c4-diagrams | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| chain-of-thought | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| clean-architecture | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| control-chart-shewhart | ✓ | 92% | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | 75% | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| conventional-commits | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| cqrs | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| cynefin-framework | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| definition-of-done | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| devils-advocate | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| diataxis-framework | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| docs-as-code | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| domain-driven-design | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| ears-requirements | ✓ | 92% | 83% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | 75% | 75% |
-| recognition | ✓ | ✓ | 75% |
-| event-driven-architecture | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| fagan-inspection | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| feynman-technique | 67% | 67% | 92% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | 0% | 0% | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| five-whys | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| fowler-patterns | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| gherkin | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| github-flow | 92% | 92% | 92% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | 75% | 75% | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| gutes-deutsch-wolf-schneider | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| hexagonal-architecture | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| iec-61508-sil-levels | ✓ | 92% | 83% |
-| application-anchor | ✓ | ✓ | 50% |
-| application-paraphrase | ✓ | 75% | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| impact-mapping | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| invest | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| iso-25010 | ✓ | ✓ | 83% |
-| application-anchor | ✓ | ✓ | 75% |
-| application-paraphrase | ✓ | ✓ | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| jobs-to-be-done | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| lasr | ✓ | 92% | 75% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | 75% | 25% |
-| linddun | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| llm-evaluations | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| madr | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| mece | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| morphological-box | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| moscow | ✓ | 92% | 75% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | 75% | 25% |
-| recognition | ✓ | ✓ | ✓ |
-| mutation-testing | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| nelson-rules | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| owasp-top-10 | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| plain-english-strunk-white | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| prd | ✓ | 92% | 67% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | 75% | 0% |
-| problem-space-nvc | ✓ | ✓ | 83% |
-| application-anchor | ✓ | ✓ | 75% |
-| application-paraphrase | ✓ | ✓ | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| property-based-testing | ✓ | 83% | ✓ |
-| application-anchor | ✓ | 75% | ✓ |
-| application-paraphrase | ✓ | 75% | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| pyramid-principle | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| semantic-versioning | ✓ | ✓ | 75% |
-| application-anchor | ✓ | ✓ | 50% |
-| application-paraphrase | ✓ | ✓ | 75% |
-| recognition | ✓ | ✓ | ✓ |
-| socratic-method | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| sota | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| spc | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| stride | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| swot | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| tdd-chicago-school | ✓ | 92% | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | 75% | ✓ |
-| tdd-london-school | ✓ | 89% | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| consistency-language | ✓ | ✓ | ✓ |
-| consistency-variant-1 | ✓ | ✓ | ✓ |
-| consistency-variant-2 | ✓ | ✓ | ✓ |
-| consistency-variant-3 | ✓ | 50% | ✓ |
-| recognition | ✓ | 75% | ✓ |
-| testing-pyramid | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| timtowtdi | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| todotxt-flavoured-markdown | ✓ | ✓ | 83% |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | 50% |
-| user-story-mapping | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
-| wardley-mapping | ✓ | ✓ | ✓ |
-| application-anchor | ✓ | ✓ | ✓ |
-| application-paraphrase | ✓ | ✓ | ✓ |
-| recognition | ✓ | ✓ | ✓ |
+| adr-according-to-nygard | ✓ | ✓ | ✓ | 92% | 50% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| recognition | ✓ | ✓ | ✓ | 75% | 75% | 75% |
+| arc42 | ✓ | ✓ | ✓ | ✓ | 46% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| consistency-language | ✓ | ✓ | ✓ | ✓ | 0% | ✓ |
+| consistency-variant-1 | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| consistency-variant-2 | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| consistency-variant-3 | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| atam | ✓ | 83% | ✓ | ✓ | 58% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | 75% |
+| application-paraphrase | ✓ | 50% | ✓ | ✓ | 50% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| bdd-given-when-then | ✓ | 50% | ✓ | 83% | 33% | 92% |
+| application-anchor | ✓ | 50% | ✓ | ✓ | 50% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | 50% | 25% | 75% |
+| recognition | ✓ | 0% | ✓ | ✓ | 25% | ✓ |
+| bem-methodology | ✓ | ✓ | ✓ | ✓ | 42% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| bluf | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| c4-diagrams | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 0% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 25% | ✓ |
+| chain-of-thought | ✓ | ✓ | ✓ | ✓ | 58% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 50% | ✓ |
+| clean-architecture | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| control-chart-shewhart | ✓ | ✓ | 92% | ✓ | 92% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | 75% | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| conventional-commits | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| cqrs | ✓ | ✓ | ✓ | ✓ | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| cynefin-framework | ✓ | ✓ | ✓ | ✓ | 92% | ✓ |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| definition-of-done | ✓ | ✓ | ✓ | ✓ | 83% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| devils-advocate | ✓ | ✓ | ✓ | ✓ | ✓ | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| diataxis-framework | ✓ | ✓ | ✓ | ✓ | 92% | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 25% |
+| docs-as-code | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| domain-driven-design | ✓ | ✓ | ✓ | ✓ | ✓ | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| ears-requirements | ✓ | 83% | 92% | 83% | ✓ | 17% |
+| application-anchor | ✓ | 75% | ✓ | ✓ | ✓ | 0% |
+| application-paraphrase | ✓ | 75% | 75% | 75% | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | 75% | ✓ | 0% |
+| event-driven-architecture | ✓ | ✓ | ✓ | ✓ | 83% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| fagan-inspection | ✓ | ✓ | ✓ | ✓ | 83% | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| feynman-technique | 67% | 83% | 67% | 92% | 92% | 75% |
+| application-anchor | ✓ | 75% | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | 0% | 75% | 0% | 75% | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| five-whys | ✓ | ✓ | ✓ | ✓ | ✓ | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| fowler-patterns | ✓ | ✓ | ✓ | ✓ | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| gherkin | ✓ | ✓ | ✓ | ✓ | ✓ | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| github-flow | 92% | 92% | 92% | 92% | ✓ | 42% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | 75% | 75% | 75% | 75% | ✓ | 0% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| gutes-deutsch-wolf-schneider | ✓ | ✓ | ✓ | ✓ | 83% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| hexagonal-architecture | ✓ | ✓ | ✓ | ✓ | 92% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| iec-61508-sil-levels | ✓ | 83% | 92% | 83% | 92% | 33% |
+| application-anchor | ✓ | 75% | ✓ | 50% | ✓ | 0% |
+| application-paraphrase | ✓ | 75% | 75% | ✓ | ✓ | 0% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| impact-mapping | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| invest | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| iso-25010 | ✓ | ✓ | ✓ | 83% | ✓ | 75% |
+| application-anchor | ✓ | ✓ | ✓ | 75% | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | 75% | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| jobs-to-be-done | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| lasr | ✓ | 83% | 92% | 75% | ✓ | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | 50% | 75% | 25% | ✓ | 50% |
+| linddun | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| llm-evaluations | ✓ | ✓ | ✓ | ✓ | ✓ | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| madr | ✓ | 83% | ✓ | ✓ | 75% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | 50% | ✓ | ✓ | 25% | 75% |
+| mece | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| morphological-box | ✓ | ✓ | ✓ | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| moscow | ✓ | ✓ | 92% | 75% | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | 75% | 25% | 75% | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| mutation-testing | ✓ | ✓ | ✓ | ✓ | 92% | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| nelson-rules | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| owasp-top-10 | ✓ | ✓ | ✓ | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| plain-english-strunk-white | ✓ | ✓ | ✓ | ✓ | ✓ | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| prd | ✓ | 83% | 92% | 67% | 83% | 58% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | 50% | 75% | 0% | 50% | 25% |
+| problem-space-nvc | ✓ | ✓ | ✓ | 83% | ✓ | 50% |
+| application-anchor | ✓ | ✓ | ✓ | 75% | ✓ | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | 75% | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| property-based-testing | ✓ | 92% | 83% | ✓ | 92% | 67% |
+| application-anchor | ✓ | 75% | 75% | ✓ | ✓ | 25% |
+| application-paraphrase | ✓ | ✓ | 75% | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| pyramid-principle | ✓ | ✓ | ✓ | ✓ | 92% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| semantic-versioning | ✓ | ✓ | ✓ | 75% | 83% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | 50% | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | 75% | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| socratic-method | ✓ | ✓ | ✓ | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| sota | ✓ | ✓ | ✓ | ✓ | 83% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| spc | ✓ | ✓ | ✓ | ✓ | 92% | 50% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | 75% | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| stride | ✓ | ✓ | ✓ | ✓ | 92% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| swot | ✓ | ✓ | ✓ | ✓ | 83% | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| tdd-chicago-school | ✓ | 92% | 92% | ✓ | ✓ | 67% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | 75% | 75% | ✓ | ✓ | 50% |
+| tdd-london-school | ✓ | 89% | 89% | ✓ | 71% | 54% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| consistency-language | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| consistency-variant-1 | ✓ | ✓ | ✓ | ✓ | 75% | 50% |
+| consistency-variant-2 | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| consistency-variant-3 | ✓ | 25% | 50% | ✓ | 0% | 25% |
+| recognition | ✓ | ✓ | 75% | ✓ | 75% | 50% |
+| testing-pyramid | ✓ | ✓ | ✓ | ✓ | 83% | 92% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| timtowtdi | ✓ | ✓ | ✓ | ✓ | 83% | 83% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | 75% | ✓ |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| todotxt-flavoured-markdown | ✓ | 83% | ✓ | 83% | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 75% |
+| recognition | ✓ | 50% | ✓ | 50% | 75% | 75% |
+| user-story-mapping | ✓ | ✓ | ✓ | ✓ | 92% | 75% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| recognition | ✓ | ✓ | ✓ | ✓ | 75% | 75% |
+| wardley-mapping | ✓ | ✓ | ✓ | ✓ | ✓ | 42% |
+| application-anchor | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
+| application-paraphrase | ✓ | ✓ | ✓ | ✓ | ✓ | 25% |
+| recognition | ✓ | ✓ | ✓ | ✓ | ✓ | 50% |
Control Questions
-| Control | Claude Sonnet | GPT-4o | Mistral Large |
+| Control | claude-sonnet-4-20250514 | devstral-2512 | gpt-4o | mistral-large-2512 | mistral-medium-2508 | mistral-small-2603 |
-| negative-control | 100% | 100% | 75% |
-| sanity-check | 0% | 0% | 0% |
+| negative-control | 100% | 75% | 100% | 75% | 100% | 50% |
+| sanity-check | 0% | 0% | 0% | 0% | 0% | 0% |
Failures Detail
-
Claude Sonnet: 2 failures
+
claude-sonnet-4-20250514: 2 failures
feynman-technique/application-paraphrase0%
github-flow/application-paraphrase75%
-
GPT-4o: 13 failures
+
devstral-2512: 17 failures
+
+
atam/application-paraphrase50%
+
bdd-given-when-then/application-anchor50%
+
bdd-given-when-then/recognition0%
+
ears-requirements/application-anchor75%
+
ears-requirements/application-paraphrase75%
+
feynman-technique/application-anchor75%
+
feynman-technique/application-paraphrase75%
+
github-flow/application-paraphrase75%
+
iec-61508-sil-levels/application-anchor75%
+
iec-61508-sil-levels/application-paraphrase75%
+
lasr/recognition50%
+
madr/recognition50%
+
prd/recognition50%
+
property-based-testing/application-anchor75%
+
tdd-chicago-school/recognition75%
+
tdd-london-school/consistency-variant-325%
+
todotxt-flavoured-markdown/recognition50%
+
+
gpt-4o: 13 failures
control-chart-shewhart/application-paraphrase75%
ears-requirements/application-paraphrase75%
@@ -355,7 +393,7 @@
GPT-4o: 13 failures
tdd-london-school/consistency-variant-350%
tdd-london-school/recognition75%
-
Mistral Large: 17 failures
+
mistral-large-2512: 17 failures
adr-according-to-nygard/recognition75%
bdd-given-when-then/application-paraphrase50%
@@ -375,11 +413,209 @@
Mistral Large: 17 failures
semantic-versioning/application-paraphrase75%
todotxt-flavoured-markdown/recognition50%
+
mistral-medium-2508: 77 failures
+
+
adr-according-to-nygard/application-anchor50%
+
adr-according-to-nygard/application-paraphrase25%
+
adr-according-to-nygard/recognition75%
+
arc42/application-anchor75%
+
arc42/application-paraphrase50%
+
arc42/consistency-language0%
+
arc42/consistency-variant-125%
+
arc42/consistency-variant-250%
+
arc42/consistency-variant-350%
+
arc42/recognition75%
+
atam/application-anchor50%
+
atam/application-paraphrase50%
+
atam/recognition75%
+
bdd-given-when-then/application-anchor50%
+
bdd-given-when-then/application-paraphrase25%
+
bdd-given-when-then/recognition25%
+
bem-methodology/application-anchor50%
+
bem-methodology/application-paraphrase25%
+
bem-methodology/recognition50%
+
bluf/application-anchor75%
+
bluf/application-paraphrase75%
+
bluf/recognition75%
+
c4-diagrams/application-anchor50%
+
c4-diagrams/application-paraphrase0%
+
c4-diagrams/recognition25%
+
chain-of-thought/application-anchor50%
+
chain-of-thought/application-paraphrase75%
+
chain-of-thought/recognition50%
+
control-chart-shewhart/application-anchor75%
+
cqrs/application-anchor75%
+
cynefin-framework/application-anchor75%
+
definition-of-done/application-paraphrase75%
+
definition-of-done/recognition75%
+
diataxis-framework/recognition75%
+
docs-as-code/application-paraphrase75%
+
event-driven-architecture/application-paraphrase75%
+
event-driven-architecture/recognition75%
+
fagan-inspection/application-anchor75%
+
fagan-inspection/application-paraphrase75%
+
feynman-technique/application-paraphrase75%
+
fowler-patterns/application-anchor75%
+
gutes-deutsch-wolf-schneider/application-anchor75%
+
gutes-deutsch-wolf-schneider/recognition75%
+
hexagonal-architecture/application-paraphrase75%
+
iec-61508-sil-levels/recognition75%
+
jobs-to-be-done/recognition75%
+
linddun/application-paraphrase75%
+
madr/recognition25%
+
mece/application-anchor75%
+
morphological-box/application-anchor75%
+
moscow/application-paraphrase75%
+
mutation-testing/application-anchor75%
+
nelson-rules/application-anchor75%
+
owasp-top-10/application-anchor75%
+
prd/recognition50%
+
property-based-testing/recognition75%
+
pyramid-principle/application-anchor75%
+
semantic-versioning/application-anchor75%
+
semantic-versioning/application-paraphrase75%
+
socratic-method/application-anchor75%
+
sota/application-anchor75%
+
sota/application-paraphrase75%
+
spc/application-anchor75%
+
stride/recognition75%
+
swot/application-paraphrase75%
+
swot/recognition75%
+
tdd-london-school/application-paraphrase75%
+
tdd-london-school/consistency-language75%
+
tdd-london-school/consistency-variant-175%
+
tdd-london-school/consistency-variant-30%
+
tdd-london-school/recognition75%
+
testing-pyramid/application-paraphrase75%
+
testing-pyramid/recognition75%
+
timtowtdi/application-paraphrase75%
+
timtowtdi/recognition75%
+
todotxt-flavoured-markdown/recognition75%
+
user-story-mapping/recognition75%
+
+
mistral-small-2603: 115 failures
+
+
adr-according-to-nygard/application-anchor75%
+
adr-according-to-nygard/recognition75%
+
atam/application-anchor75%
+
bdd-given-when-then/application-paraphrase75%
+
conventional-commits/application-anchor50%
+
conventional-commits/application-paraphrase75%
+
cqrs/application-anchor75%
+
cqrs/application-paraphrase50%
+
definition-of-done/application-anchor75%
+
definition-of-done/application-paraphrase50%
+
definition-of-done/recognition75%
+
devils-advocate/application-anchor75%
+
devils-advocate/application-paraphrase50%
+
devils-advocate/recognition50%
+
diataxis-framework/application-anchor75%
+
diataxis-framework/application-paraphrase75%
+
diataxis-framework/recognition25%
+
docs-as-code/application-paraphrase50%
+
domain-driven-design/application-paraphrase50%
+
domain-driven-design/recognition50%
+
ears-requirements/application-anchor0%
+
ears-requirements/application-paraphrase50%
+
ears-requirements/recognition0%
+
event-driven-architecture/application-anchor75%
+
event-driven-architecture/application-paraphrase75%
+
event-driven-architecture/recognition75%
+
fagan-inspection/application-anchor25%
+
fagan-inspection/application-paraphrase75%
+
fagan-inspection/recognition75%
+
feynman-technique/application-anchor50%
+
feynman-technique/application-paraphrase75%
+
five-whys/application-anchor75%
+
fowler-patterns/application-anchor75%
+
fowler-patterns/recognition50%
+
gherkin/application-anchor75%
+
gherkin/recognition75%
+
github-flow/application-anchor50%
+
github-flow/application-paraphrase0%
+
github-flow/recognition75%
+
gutes-deutsch-wolf-schneider/recognition75%
+
hexagonal-architecture/recognition75%
+
iec-61508-sil-levels/application-anchor0%
+
iec-61508-sil-levels/application-paraphrase0%
+
impact-mapping/application-anchor50%
+
impact-mapping/application-paraphrase25%
+
impact-mapping/recognition75%
+
invest/recognition25%
+
iso-25010/application-anchor50%
+
iso-25010/application-paraphrase75%
+
jobs-to-be-done/application-anchor50%
+
lasr/application-anchor50%
+
lasr/application-paraphrase75%
+
lasr/recognition50%
+
linddun/application-paraphrase50%
+
llm-evaluations/application-anchor75%
+
llm-evaluations/recognition75%
+
madr/application-anchor50%
+
madr/application-paraphrase75%
+
madr/recognition75%
+
mece/application-anchor50%
+
morphological-box/application-anchor25%
+
morphological-box/application-paraphrase75%
+
moscow/application-paraphrase50%
+
moscow/recognition75%
+
mutation-testing/application-anchor75%
+
mutation-testing/application-paraphrase25%
+
mutation-testing/recognition75%
+
nelson-rules/application-anchor50%
+
owasp-top-10/application-anchor75%
+
owasp-top-10/application-paraphrase50%
+
owasp-top-10/recognition75%
+
plain-english-strunk-white/application-anchor50%
+
plain-english-strunk-white/application-paraphrase75%
+
plain-english-strunk-white/recognition50%
+
prd/application-anchor50%
+
prd/recognition25%
+
problem-space-nvc/application-anchor25%
+
problem-space-nvc/application-paraphrase50%
+
problem-space-nvc/recognition75%
+
property-based-testing/application-anchor25%
+
property-based-testing/recognition75%
+
pyramid-principle/application-anchor75%
+
pyramid-principle/application-paraphrase75%
+
semantic-versioning/application-paraphrase75%
+
socratic-method/application-paraphrase75%
+
socratic-method/recognition25%
+
sota/application-anchor50%
+
sota/application-paraphrase75%
+
spc/application-anchor25%
+
spc/application-paraphrase75%
+
spc/recognition50%
+
stride/application-anchor50%
+
stride/recognition50%
+
swot/application-anchor25%
+
swot/application-paraphrase75%
+
tdd-chicago-school/application-paraphrase50%
+
tdd-chicago-school/recognition50%
+
tdd-london-school/application-anchor50%
+
tdd-london-school/application-paraphrase75%
+
tdd-london-school/consistency-language50%
+
tdd-london-school/consistency-variant-150%
+
tdd-london-school/consistency-variant-275%
+
tdd-london-school/consistency-variant-325%
+
tdd-london-school/recognition50%
+
testing-pyramid/application-paraphrase75%
+
timtowtdi/application-anchor75%
+
timtowtdi/recognition75%
+
todotxt-flavoured-markdown/application-anchor75%
+
todotxt-flavoured-markdown/application-paraphrase75%
+
todotxt-flavoured-markdown/recognition75%
+
user-story-mapping/application-paraphrase50%
+
user-story-mapping/recognition75%
+
wardley-mapping/application-anchor50%
+
wardley-mapping/application-paraphrase25%
+
wardley-mapping/recognition50%
+