LLM-Coding · rdmueller · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/docs/anchor-evaluations.adoc b/docs/anchor-evaluations.adoc
@@ -407,45 +407,63 @@ For Claude, this means Sonnet 4.6 (not Opus).
 For GPT and Gemini, the mid-tier variants are not yet clearly established, so we test the current flagship (GPT-5, Gemini 2.5 Pro) and add smaller variants when they become available.
 A follow-up round with the cheapest variants (Haiku, GPT-5 mini, Gemini Flash) would reveal the lower boundary of anchor activation.
 
+IMPORTANT: Always record the *exact model identifier with date suffix* (e.g., `mistral-large-2512`, not `mistral-large-latest`).
+Model aliases like `-latest` can be repointed to a newer model without notice, which makes earlier results impossible to reproduce.
+
 *Commercial models (API cost per call):*
 
-[cols="1,2"]
+[cols="2,1,2"]
 |===
-|Model |Rationale
+|Model |API ID |Rationale
 
 |Claude Sonnet 4.6
+|`claude-sonnet-4-20250514`
 |Our primary development model. Serves as the baseline.
 
-|GPT-5
-|Largest market share, OpenAI ecosystem.
+|GPT-4o / GPT-5
+|`gpt-4o` / `gpt-5`
+|OpenAI ecosystem. GPT-4o as mid-tier, GPT-5 as flagship.
+
+|Mistral Large 3
+|`mistral-large-2512`
+|European flagship. Already tested (96%).
+
+|Mistral Medium 3.1
+|`mistral-medium-2508`
+|European mid-tier. Frontier-class multimodal.
+
+|Mistral Small 4
+|`mistral-small-2603`
+|European small model. Hybrid reasoning+coding (March 2026).
+
+|Devstral 2
+|`devstral-2512`
+|Code-specialized model. Tests whether SE-focused training improves anchor recognition.
 
 |Gemini 2.5 Pro
+|TBD
 |Google, different training approach.
 |===
 
-*Open-weight models (available as open-source):*
+*Open-weight models (run locally via Ollama):*
 
-[cols="1,2,1"]
+[cols="2,1,2"]
 |===
-|Model |Rationale |Local?
+|Model |Local? |Rationale
 
 |Llama 4 Maverick
-|Largest open-weight model. Shows whether anchors work without proprietary training -- relevant for self-hosted setups.
 |Yes (Ollama)
-
-|Mistral Large
-|European model with a different training focus. Interesting because the anchor catalog is heavily influenced by English-language software engineering literature.
-|No (too large, use La Plateforme API)
+|Largest open-weight model. Shows whether anchors work without proprietary training.
 
 |DeepSeek V3
+|Yes (Ollama)
 |Chinese model. Tests whether anchors work across cultural and training-data boundaries.
+
+|Ministral 3 8B
 |Yes (Ollama)
+|Mistral's tiny model. Tests the lower boundary of anchor activation.
 |===
 
-Llama and DeepSeek can run locally (e.g., via Ollama) at no API cost.
-Mistral Large requires the Mistral API -- it is open-weight but too large for local inference.
-This means 4 models have API costs (Claude, GPT, Gemini, Mistral) and 2 run locally for free.
-
 === Effort Estimate
 
 Each question runs 4 times (randomized option order) to control for position bias.

diff --git a/evaluations/generate-report.py b/evaluations/generate-report.py
@@ -22,32 +22,55 @@
 RESULTS_DIR = Path(__file__).parent / "results"
 SPECS_DIR = Path(__file__).parent / "specs"
 
-# Models to include and display order
-MODEL_DISPLAY = {
+# Fallback display names (used when config doesn't have exact model ID)
+MODEL_DISPLAY_FALLBACK = {
     "claude": "Claude Sonnet",
     "claude-cli": "Claude Sonnet (CLI)",
     "claude-haiku": "Claude Haiku",
     "openai": "GPT-4o",
-    "mistral": "Mistral Large",
+    "mistral": "Mistral",
     "ollama": "Ollama (local)",
 }
 
+
+def get_model_display(backend, config):
+    """Get exact model display name from config."""
+    if backend == "openai" and config.get("openai_model"):
+        return config["openai_model"]
+    if backend == "mistral" and config.get("mistral_model"):
+        return config["mistral_model"]
+    if backend == "deepseek" and config.get("deepseek_model"):
+        return config["deepseek_model"]
+    if backend == "ollama" and config.get("ollama_model"):
+        return f"ollama/{config['ollama_model']}"
+    if backend == "claude":
+        return "claude-sonnet-4-20250514"
+    if backend == "claude-cli":
+        return "claude-sonnet-4 (CLI)"
+    if backend == "claude-haiku":
+        return "claude-haiku-4-5"
+    return MODEL_DISPLAY_FALLBACK.get(backend, backend)
+
 CONTROL_ANCHORS = {"sanity-check", "negative-control"}
 
 
 def load_best_results():
-    """Load the latest result with the most questions per model."""
+    """Load, per exact model identifier, the result with the most questions (later files win ties)."""
     results = {}
     for f in sorted(RESULTS_DIR.glob("pilot-*.json")):
         d = json.load(open(f, encoding="utf-8"))
+        config = d.get("config", {})
         for m, r in d["models"].items():
-            if m not in results or len(r) >= len(results[m]["data"]):
-                results[m] = {
+            # Use exact model ID as key instead of backend alias
+            exact_id = get_model_display(m, config)
+            if exact_id not in results or len(r) >= len(results[exact_id]["data"]):
+                results[exact_id] = {
                     "data": r,
                     "file": f.name,
-                    "config": d.get("config", {}),
+                    "config": config,
                     "duration": d.get("duration_seconds", 0),
                     "timestamp": d.get("timestamp", ""),
+                    "backend": m,
                 }
     return results
 
@@ -71,19 +94,15 @@ def score_bg(score):
 
 
 def generate_html(results, output_path):
+    # Keys are already exact model IDs (e.g. "mistral-large-2512")
+    display_names = {m: m for m in results}
+
     # Collect all anchors and questions
     all_questions = defaultdict(dict)  # anchor/label -> {model: score}
-    model_names = []
-
-    # Prefer full runs (75 questions) over pilot runs
-    for m in ["claude", "openai", "mistral"]:
-        if m in results and len(results[m]["data"]) >= 60:
-            model_names.append(m)
 
-    # Add smaller runs if no full run exists
-    for m in ["claude-cli", "claude-haiku", "ollama"]:
-        if m in results and m not in model_names:
-            model_names.append(m)
+    # Sort models: most questions first, then alphabetically
+    model_names = sorted(results.keys(),
+                         key=lambda m: (-len(results[m]["data"]), m))
 
     for m in model_names:
         for q in results[m]["data"]:
@@ -162,7 +181,7 @@ def generate_html(results, output_path):
 
     for m in model_names:
         avg = model_avgs.get(m, 0)
-        display = MODEL_DISPLAY.get(m, m)
+        display = display_names.get(m, m)
         n = len([1 for l in anchor_questions if anchor_questions[l].get(m) is not None])
         info = results[m]
         html += f"""  <div class="summary-card">
@@ -181,7 +200,7 @@ def generate_html(results, output_path):
 """
 
     for m in model_names:
-        html += f"  <th style='text-align:center'>{MODEL_DISPLAY.get(m, m)}</th>\n"
+        html += f"  <th style='text-align:center'>{display_names.get(m, m)}</th>\n"
     html += "</tr></thead>\n<tbody>\n"
 
     for anchor_id in sorted(anchor_groups.keys()):
@@ -224,7 +243,7 @@ def generate_html(results, output_path):
     if control_questions:
         html += '<h2>Control Questions</h2>\n<table class="controls">\n<thead><tr><th>Control</th>'
         for m in model_names:
-            html += f"<th style='text-align:center'>{MODEL_DISPLAY.get(m, m)}</th>"
+            html += f"<th style='text-align:center'>{display_names.get(m, m)}</th>"
         html += "</tr></thead>\n<tbody>\n"
         for label in sorted(control_questions.keys()):
             short = label.replace("/recognition", "")
@@ -246,9 +265,9 @@ def generate_html(results, output_path):
         fails = [(q["label"], q["score"]) for q in results[m]["data"]
                  if q["score"] < 1.0 and not any(q["label"].startswith(c) for c in CONTROL_ANCHORS)]
         if not fails:
-            html += f"<h3>{MODEL_DISPLAY.get(m, m)}: no failures</h3>\n"
+            html += f"<h3>{display_names.get(m, m)}: no failures</h3>\n"
         else:
-            html += f'<h3>{MODEL_DISPLAY.get(m, m)}: {len(fails)} failures</h3>\n<div class="fail-list">\n'
+            html += f'<h3>{display_names.get(m, m)}: {len(fails)} failures</h3>\n<div class="fail-list">\n'
             for label, score in sorted(fails):
                 html += f'<div class="fail-item"><span>{h(label)}</span><span style="color:{score_color(score)};font-weight:600">{score:.0%}</span></div>\n'
             html += "</div>\n"
@@ -262,7 +281,7 @@ def generate_html(results, output_path):
     for m in model_names:
         info = results[m]
         dur = info["duration"]
-        html += f"<dt>{MODEL_DISPLAY.get(m, m)}:</dt><dd>{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}</dd><br>"
+        html += f"<dt>{display_names.get(m, m)}:</dt><dd>{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}</dd><br>"
 
     html += """
 </dl>

diff --git a/evaluations/pilot.py b/evaluations/pilot.py
@@ -312,7 +312,20 @@ def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_
     print()
 
     ts = datetime.now().strftime("%Y%m%d-%H%M%S")
-    out_file = RESULTS_DIR / f"pilot-{ts}.json"
+    # Include model names in the filename so parallel runs started in the same second don't overwrite each other's results
+    model_suffix = "_".join(models)
+    for m in models:
+        if m == "openai":
+            model_suffix = model_suffix.replace("openai", openai_model)
+        elif m == "mistral":
+            model_suffix = model_suffix.replace("mistral", mistral_model)
+        elif m == "deepseek":
+            model_suffix = model_suffix.replace("deepseek", deepseek_model)
+        elif m == "ollama":
+            model_suffix = model_suffix.replace("ollama", f"ollama-{ollama_model}")
+    # Sanitize for filename
+    model_suffix = model_suffix.replace(":", "-").replace("/", "-")
+    out_file = RESULTS_DIR / f"pilot-{ts}_{model_suffix}.json"
 
     all_results = {
         "timestamp": datetime.now(timezone.utc).isoformat(),