Commit 0e0a749

Merge pull request #353 from raifdmueller/feat/mistral-models-evaluation
feat: evaluate 6 models including Mistral Small/Medium/Devstral
2 parents ceee4d0 + b2bf619 commit 0e0a749

15 files changed

Lines changed: 5857 additions & 31899 deletions

docs/anchor-evaluations.adoc

Lines changed: 34 additions & 16 deletions
@@ -407,45 +407,63 @@ For Claude, this means Sonnet 4.6 (not Opus).
 For GPT and Gemini, the mid-tier variants are not yet clearly established, so we test the current flagship (GPT-5, Gemini 2.5 Pro) and add smaller variants when they become available.
 A follow-up round with the cheapest variants (Haiku, GPT-5 mini, Gemini Flash) would reveal the lower boundary of anchor activation.
 
+IMPORTANT: Always record the *exact model identifier with date suffix* (e.g., `mistral-large-2512`, not `mistral-large-latest`).
+Model aliases like `-latest` can change without notice.
+
 *Commercial models (API cost per call):*
 
-[cols="1,2"]
+[cols="2,1,2"]
 |===
-|Model |Rationale
+|Model |API ID |Rationale
 
 |Claude Sonnet 4.6
+|`claude-sonnet-4-20250514`
 |Our primary development model. Serves as the baseline.
 
-|GPT-5
-|Largest market share, OpenAI ecosystem.
+|GPT-4o / GPT-5
+|`gpt-4o` / `gpt-5`
+|OpenAI ecosystem. GPT-4o as mid-tier, GPT-5 as flagship.
+
+|Mistral Large 3
+|`mistral-large-2512`
+|European flagship. Already tested (96%).
+
+|Mistral Medium 3.1
+|`mistral-medium-2508`
+|European mid-tier. Frontier-class multimodal.
+
+|Mistral Small 4
+|`mistral-small-2603`
+|European small model. Hybrid reasoning+coding (March 2026).
+
+|Devstral 2
+|`devstral-2512`
+|Code-specialized model. Tests whether SE-focused training improves anchor recognition.
 
 |Gemini 2.5 Pro
+|TBD
 |Google, different training approach.
 |===
 
-*Open-weight models (available as open-source):*
+*Open-weight models (run locally via Ollama):*
 
-[cols="1,2,1"]
+[cols="2,1,2"]
 |===
-|Model |Rationale |Local?
+|Model |Local? |Rationale
 
 |Llama 4 Maverick
-|Largest open-weight model. Shows whether anchors work without proprietary training -- relevant for self-hosted setups.
 |Yes (Ollama)
-
-|Mistral Large
-|European model with a different training focus. Interesting because the anchor catalog is heavily influenced by English-language software engineering literature.
-|No (too large, use La Plateforme API)
+|Largest open-weight model. Shows whether anchors work without proprietary training.
 
 |DeepSeek V3
+|Yes (Ollama)
 |Chinese model. Tests whether anchors work across cultural and training-data boundaries.
+
+|Ministral 3 8B
 |Yes (Ollama)
+|Mistral's tiny model. Lower boundary test.
 |===
 
-Llama and DeepSeek can run locally (e.g., via Ollama) at no API cost.
-Mistral Large requires the Mistral API -- it is open-weight but too large for local inference.
-This means 4 models have API costs (Claude, GPT, Gemini, Mistral) and 2 run locally for free.
-
 === Effort Estimate
 
 Each question runs 4 times (randomized option order) to control for position bias.
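
Editor's note: the IMPORTANT note added above is easy to violate silently, because `-latest` aliases resolve without error. A minimal guard sketch (hypothetical helper, not part of this commit; it assumes Mistral/Anthropic-style date suffixes, so an undated ID like `gpt-4o` would need its own rule):

    import re

    # Hypothetical check, not in this commit: reject alias model IDs.
    DATED_SUFFIX = re.compile(r"-(\d{4}|\d{8})$")   # e.g. -2512 or -20250514

    def require_dated_model_id(model_id: str) -> str:
        """Fail fast when a config pins an alias instead of a dated ID."""
        if model_id.endswith("-latest") or not DATED_SUFFIX.search(model_id):
            raise ValueError(f"pin an exact dated model ID, got {model_id!r}")
        return model_id

    require_dated_model_id("mistral-large-2512")        # ok
    require_dated_model_id("claude-sonnet-4-20250514")  # ok
    # require_dated_model_id("mistral-large-latest")    # raises ValueError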

evaluations/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -1 +1,3 @@
 *.pyc
+# Full results with raw responses (large, reproducible via pilot.py)
+results/

evaluations/generate-report.py

Lines changed: 43 additions & 24 deletions
@@ -19,35 +19,58 @@
 from collections import defaultdict
 from pathlib import Path
 
-RESULTS_DIR = Path(__file__).parent / "results"
+RESULTS_DIR = Path(__file__).parent / "summaries"
 SPECS_DIR = Path(__file__).parent / "specs"
 
-# Models to include and display order
-MODEL_DISPLAY = {
+# Fallback display names (used when config doesn't have exact model ID)
+MODEL_DISPLAY_FALLBACK = {
     "claude": "Claude Sonnet",
     "claude-cli": "Claude Sonnet (CLI)",
     "claude-haiku": "Claude Haiku",
     "openai": "GPT-4o",
-    "mistral": "Mistral Large",
+    "mistral": "Mistral",
     "ollama": "Ollama (local)",
 }
 
+
+def get_model_display(backend, config):
+    """Get exact model display name from config."""
+    if backend == "openai" and config.get("openai_model"):
+        return config["openai_model"]
+    if backend == "mistral" and config.get("mistral_model"):
+        return config["mistral_model"]
+    if backend == "deepseek" and config.get("deepseek_model"):
+        return config["deepseek_model"]
+    if backend == "ollama" and config.get("ollama_model"):
+        return f"ollama/{config['ollama_model']}"
+    if backend == "claude":
+        return "claude-sonnet-4-20250514"
+    if backend == "claude-cli":
+        return "claude-sonnet-4 (CLI)"
+    if backend == "claude-haiku":
+        return "claude-haiku-4-5"
+    return MODEL_DISPLAY_FALLBACK.get(backend, backend)
+
 CONTROL_ANCHORS = {"sanity-check", "negative-control"}
 
 
 def load_best_results():
-    """Load the latest result with the most questions per model."""
+    """Load the latest result per unique model identifier."""
     results = {}
     for f in sorted(RESULTS_DIR.glob("pilot-*.json")):
         d = json.load(open(f, encoding="utf-8"))
+        config = d.get("config", {})
         for m, r in d["models"].items():
-            if m not in results or len(r) >= len(results[m]["data"]):
-                results[m] = {
+            # Use exact model ID as key instead of backend alias
+            exact_id = get_model_display(m, config)
+            if exact_id not in results or len(r) >= len(results[exact_id]["data"]):
+                results[exact_id] = {
                     "data": r,
                     "file": f.name,
-                    "config": d.get("config", {}),
+                    "config": config,
                     "duration": d.get("duration_seconds", 0),
                     "timestamp": d.get("timestamp", ""),
+                    "backend": m,
                 }
     return results
 
@@ -71,19 +94,15 @@ def score_bg(score):
 
 
 def generate_html(results, output_path):
+    # Keys are already exact model IDs (e.g. "mistral-large-2512")
+    display_names = {m: m for m in results}
+
     # Collect all anchors and questions
     all_questions = defaultdict(dict)  # anchor/label -> {model: score}
-    model_names = []
-
-    # Prefer full runs (75 questions) over pilot runs
-    for m in ["claude", "openai", "mistral"]:
-        if m in results and len(results[m]["data"]) >= 60:
-            model_names.append(m)
 
-    # Add smaller runs if no full run exists
-    for m in ["claude-cli", "claude-haiku", "ollama"]:
-        if m in results and m not in model_names:
-            model_names.append(m)
+    # Sort models: most questions first, then alphabetically
+    model_names = sorted(results.keys(),
+                         key=lambda m: (-len(results[m]["data"]), m))
 
     for m in model_names:
         for q in results[m]["data"]:
@@ -162,7 +181,7 @@ def generate_html(results, output_path):
 
     for m in model_names:
         avg = model_avgs.get(m, 0)
-        display = MODEL_DISPLAY.get(m, m)
+        display = display_names.get(m, m)
         n = len([1 for l in anchor_questions if anchor_questions[l].get(m) is not None])
         info = results[m]
         html += f""" <div class="summary-card">
@@ -181,7 +200,7 @@ def generate_html(results, output_path):
 """
 
     for m in model_names:
-        html += f" <th style='text-align:center'>{MODEL_DISPLAY.get(m, m)}</th>\n"
+        html += f" <th style='text-align:center'>{display_names.get(m, m)}</th>\n"
     html += "</tr></thead>\n<tbody>\n"
 
     for anchor_id in sorted(anchor_groups.keys()):
@@ -224,7 +243,7 @@ def generate_html(results, output_path):
     if control_questions:
        html += '<h2>Control Questions</h2>\n<table class="controls">\n<thead><tr><th>Control</th>'
        for m in model_names:
-            html += f"<th style='text-align:center'>{MODEL_DISPLAY.get(m, m)}</th>"
+            html += f"<th style='text-align:center'>{display_names.get(m, m)}</th>"
        html += "</tr></thead>\n<tbody>\n"
        for label in sorted(control_questions.keys()):
            short = label.replace("/recognition", "")
@@ -246,9 +265,9 @@ def generate_html(results, output_path):
         fails = [(q["label"], q["score"]) for q in results[m]["data"]
                  if q["score"] < 1.0 and not any(q["label"].startswith(c) for c in CONTROL_ANCHORS)]
         if not fails:
-            html += f"<h3>{MODEL_DISPLAY.get(m, m)}: no failures</h3>\n"
+            html += f"<h3>{display_names.get(m, m)}: no failures</h3>\n"
         else:
-            html += f'<h3>{MODEL_DISPLAY.get(m, m)}: {len(fails)} failures</h3>\n<div class="fail-list">\n'
+            html += f'<h3>{display_names.get(m, m)}: {len(fails)} failures</h3>\n<div class="fail-list">\n'
         for label, score in sorted(fails):
             html += f'<div class="fail-item"><span>{h(label)}</span><span style="color:{score_color(score)};font-weight:600">{score:.0%}</span></div>\n'
         html += "</div>\n"
@@ -262,7 +281,7 @@ def generate_html(results, output_path):
     for m in model_names:
         info = results[m]
         dur = info["duration"]
-        html += f"<dt>{MODEL_DISPLAY.get(m, m)}:</dt><dd>{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}</dd><br>"
+        html += f"<dt>{display_names.get(m, m)}:</dt><dd>{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}</dd><br>"
 
     html += """
     </dl>
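
Editor's note: for reference, how the new `get_model_display` resolves backends once this change is applied. The `config` dict mirrors what `pilot.py` stores in each results file; the key names shown are taken from the code above:

    config = {"mistral_model": "mistral-large-2512", "openai_model": "gpt-4o"}

    get_model_display("mistral", config)  # -> "mistral-large-2512"
    get_model_display("openai", config)   # -> "gpt-4o"
    get_model_display("mistral", {})      # -> "Mistral" (fallback table)

Because these exact IDs now key `results`, runs of two different Mistral models no longer overwrite each other in the report.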

evaluations/pilot.py

Lines changed: 22 additions & 1 deletion
@@ -312,7 +312,16 @@ def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_
     print()
 
     ts = datetime.now().strftime("%Y%m%d-%H%M%S")
-    out_file = RESULTS_DIR / f"pilot-{ts}.json"
+    # Include exact model IDs in filename to prevent race conditions
+    model_ids = []
+    for m in models:
+        if m == "openai": model_ids.append(openai_model)
+        elif m == "mistral": model_ids.append(mistral_model)
+        elif m == "deepseek": model_ids.append(deepseek_model)
+        elif m == "ollama": model_ids.append(f"ollama-{ollama_model}")
+        else: model_ids.append(m)
+    model_suffix = "_".join(model_ids).replace(":", "-").replace("/", "-")
+    out_file = RESULTS_DIR / f"pilot-{ts}_{model_suffix}.json"
 
     all_results = {
         "timestamp": datetime.now(timezone.utc).isoformat(),
@@ -449,6 +458,18 @@ def append_and_save(r):
     save_results(all_results, out_file)
     print(f"\nResults saved to {out_file}")
 
+    # Also save a stripped summary (scores only, no raw responses)
+    summary_dir = RESULTS_DIR.parent / "summaries"
+    summary_dir.mkdir(parents=True, exist_ok=True)
+    summary = json.loads(json.dumps(all_results))  # deep copy
+    for m_results in summary.get("models", {}).values():
+        for r in m_results:
+            r.pop("results", None)
+    summary_file = summary_dir / out_file.name
+    with open(summary_file, "w", encoding="utf-8") as fh:
+        json.dump(summary, fh, indent=2, ensure_ascii=False)
+    print(f"Summary saved to {summary_file}")
+
     # Summary
     print("\n=== SUMMARY ===")
     print(f"Models: {', '.join(models)}")
