Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 34 additions & 16 deletions docs/anchor-evaluations.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -407,45 +407,63 @@ For Claude, this means Sonnet 4.6 (not Opus).
For GPT and Gemini, the mid-tier variants are not yet clearly established, so we test the current flagship (GPT-5, Gemini 2.5 Pro) and add smaller variants when they become available.
A follow-up round with the cheapest variants (Haiku, GPT-5 mini, Gemini Flash) would reveal the lower boundary of anchor activation.

IMPORTANT: Always record the *exact model identifier with date suffix* (e.g., `mistral-large-2512`, not `mistral-large-latest`).
Model aliases like `-latest` can be repointed to a newer model without notice, which makes earlier results impossible to reproduce.

Comment thread
raifdmueller marked this conversation as resolved.
*Commercial models (API cost per call):*

[cols="1,2"]
[cols="2,1,2"]
|===
|Model |Rationale
|Model |API ID |Rationale

|Claude Sonnet 4.6
|`claude-sonnet-4-20250514`
|Our primary development model. Serves as the baseline.

|GPT-5
|Largest market share, OpenAI ecosystem.
|GPT-4o / GPT-5
|`gpt-4o` / `gpt-5`
|OpenAI ecosystem. GPT-4o as mid-tier, GPT-5 as flagship.

|Mistral Large 3
|`mistral-large-2512`
|European flagship. Already tested (96%).

|Mistral Medium 3.1
|`mistral-medium-2508`
|European mid-tier. Frontier-class multimodal.

|Mistral Small 4
|`mistral-small-2603`
|European small model. Hybrid reasoning+coding (March 2026).

|Devstral 2
|`devstral-2512`
|Code-specialized model. Tests whether SE-focused training improves anchor recognition.

|Gemini 2.5 Pro
|TBD
|Google, different training approach.
|===

*Open-weight models (available as open-source):*
*Open-weight models (run locally via Ollama):*

[cols="1,2,1"]
[cols="2,1,2"]
|===
|Model |Rationale |Local?
|Model |Local? |Rationale

|Llama 4 Maverick
|Largest open-weight model. Shows whether anchors work without proprietary training -- relevant for self-hosted setups.
|Yes (Ollama)

|Mistral Large
|European model with a different training focus. Interesting because the anchor catalog is heavily influenced by English-language software engineering literature.
|No (too large, use La Plateforme API)
|Largest open-weight model. Shows whether anchors work without proprietary training.

|DeepSeek V3
|Yes (Ollama)
|Chinese model. Tests whether anchors work across cultural and training-data boundaries.

|Ministral 3 8B
|Yes (Ollama)
|Mistral's tiny model. Lower boundary test.
|===

Llama and DeepSeek can run locally (e.g., via Ollama) at no API cost.
Mistral Large requires the Mistral API -- it is open-weight but too large for local inference.
This means 4 models have API costs (Claude, GPT, Gemini, Mistral) and 2 run locally for free.

=== Effort Estimate

Each question runs 4 times (randomized option order) to control for position bias.
Expand Down
65 changes: 42 additions & 23 deletions evaluations/generate-report.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,32 +22,55 @@
RESULTS_DIR = Path(__file__).parent / "results"
SPECS_DIR = Path(__file__).parent / "specs"

# Models to include and display order
MODEL_DISPLAY = {
# Fallback display names (used when config doesn't have exact model ID)
MODEL_DISPLAY_FALLBACK = {
"claude": "Claude Sonnet",
"claude-cli": "Claude Sonnet (CLI)",
"claude-haiku": "Claude Haiku",
"openai": "GPT-4o",
"mistral": "Mistral Large",
"mistral": "Mistral",
"ollama": "Ollama (local)",
}


def get_model_display(backend, config):
    """Return the display label for *backend*, preferring the exact model ID.

    Args:
        backend: Backend alias such as "openai", "mistral", "deepseek",
            "ollama", "claude", "claude-cli" or "claude-haiku".
        config: Mapping read with ``.get``; only consulted for the backends
            whose model name is looked up under a ``<backend>_model`` key.

    Returns:
        The configured model ID (prefixed with ``ollama/`` for the ollama
        backend), a fixed identifier for the Claude backends, or the entry
        from MODEL_DISPLAY_FALLBACK (the backend name itself if absent).
    """
    # Backends whose model name is taken from the config mapping.
    config_keys = {
        "openai": "openai_model",
        "mistral": "mistral_model",
        "deepseek": "deepseek_model",
        "ollama": "ollama_model",
    }
    # Backends that always map to a hard-coded identifier.
    fixed_ids = {
        "claude": "claude-sonnet-4-20250514",
        "claude-cli": "claude-sonnet-4 (CLI)",
        "claude-haiku": "claude-haiku-4-5",
    }

    key = config_keys.get(backend)
    if key is not None:
        model_id = config.get(key)
        if model_id:
            if backend == "ollama":
                return f"ollama/{model_id}"
            return model_id
        # Missing or empty config value: fall through to the fallback table.
    elif backend in fixed_ids:
        return fixed_ids[backend]

    return MODEL_DISPLAY_FALLBACK.get(backend, backend)
Comment thread
raifdmueller marked this conversation as resolved.

CONTROL_ANCHORS = {"sanity-check", "negative-control"}


def load_best_results():
"""Load the latest result with the most questions per model."""
"""Load the latest result per unique model identifier."""
results = {}
for f in sorted(RESULTS_DIR.glob("pilot-*.json")):
d = json.load(open(f, encoding="utf-8"))
config = d.get("config", {})
for m, r in d["models"].items():
if m not in results or len(r) >= len(results[m]["data"]):
results[m] = {
# Use exact model ID as key instead of backend alias
exact_id = get_model_display(m, config)
if exact_id not in results or len(r) >= len(results[exact_id]["data"]):
results[exact_id] = {
"data": r,
"file": f.name,
"config": d.get("config", {}),
"config": config,
"duration": d.get("duration_seconds", 0),
"timestamp": d.get("timestamp", ""),
"backend": m,
}
return results

Expand All @@ -71,19 +94,15 @@ def score_bg(score):


def generate_html(results, output_path):
# Keys are already exact model IDs (e.g. "mistral-large-2512")
display_names = {m: m for m in results}

# Collect all anchors and questions
all_questions = defaultdict(dict) # anchor/label -> {model: score}
model_names = []

# Prefer full runs (75 questions) over pilot runs
for m in ["claude", "openai", "mistral"]:
if m in results and len(results[m]["data"]) >= 60:
model_names.append(m)

# Add smaller runs if no full run exists
for m in ["claude-cli", "claude-haiku", "ollama"]:
if m in results and m not in model_names:
model_names.append(m)
# Sort models: most questions first, then alphabetically
model_names = sorted(results.keys(),
key=lambda m: (-len(results[m]["data"]), m))

for m in model_names:
for q in results[m]["data"]:
Expand Down Expand Up @@ -162,7 +181,7 @@ def generate_html(results, output_path):

for m in model_names:
avg = model_avgs.get(m, 0)
display = MODEL_DISPLAY.get(m, m)
display = display_names.get(m, m)
n = len([1 for l in anchor_questions if anchor_questions[l].get(m) is not None])
info = results[m]
html += f""" <div class="summary-card">
Expand All @@ -181,7 +200,7 @@ def generate_html(results, output_path):
"""

for m in model_names:
html += f" <th style='text-align:center'>{MODEL_DISPLAY.get(m, m)}</th>\n"
html += f" <th style='text-align:center'>{display_names.get(m, m)}</th>\n"
html += "</tr></thead>\n<tbody>\n"

for anchor_id in sorted(anchor_groups.keys()):
Expand Down Expand Up @@ -224,7 +243,7 @@ def generate_html(results, output_path):
if control_questions:
html += '<h2>Control Questions</h2>\n<table class="controls">\n<thead><tr><th>Control</th>'
for m in model_names:
html += f"<th style='text-align:center'>{MODEL_DISPLAY.get(m, m)}</th>"
html += f"<th style='text-align:center'>{display_names.get(m, m)}</th>"
html += "</tr></thead>\n<tbody>\n"
for label in sorted(control_questions.keys()):
short = label.replace("/recognition", "")
Expand All @@ -246,9 +265,9 @@ def generate_html(results, output_path):
fails = [(q["label"], q["score"]) for q in results[m]["data"]
if q["score"] < 1.0 and not any(q["label"].startswith(c) for c in CONTROL_ANCHORS)]
if not fails:
html += f"<h3>{MODEL_DISPLAY.get(m, m)}: no failures</h3>\n"
html += f"<h3>{display_names.get(m, m)}: no failures</h3>\n"
else:
html += f'<h3>{MODEL_DISPLAY.get(m, m)}: {len(fails)} failures</h3>\n<div class="fail-list">\n'
html += f'<h3>{display_names.get(m, m)}: {len(fails)} failures</h3>\n<div class="fail-list">\n'
for label, score in sorted(fails):
html += f'<div class="fail-item"><span>{h(label)}</span><span style="color:{score_color(score)};font-weight:600">{score:.0%}</span></div>\n'
html += "</div>\n"
Expand All @@ -262,7 +281,7 @@ def generate_html(results, output_path):
for m in model_names:
info = results[m]
dur = info["duration"]
html += f"<dt>{MODEL_DISPLAY.get(m, m)}:</dt><dd>{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}</dd><br>"
html += f"<dt>{display_names.get(m, m)}:</dt><dd>{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}</dd><br>"

html += """
</dl>
Expand Down
15 changes: 14 additions & 1 deletion evaluations/pilot.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,20 @@ def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_
print()

ts = datetime.now().strftime("%Y%m%d-%H%M%S")
out_file = RESULTS_DIR / f"pilot-{ts}.json"
# Include model names in filename to prevent race conditions on parallel runs
model_suffix = "_".join(models)
for m in models:
if m == "openai":
model_suffix = model_suffix.replace("openai", openai_model)
elif m == "mistral":
model_suffix = model_suffix.replace("mistral", mistral_model)
elif m == "deepseek":
model_suffix = model_suffix.replace("deepseek", deepseek_model)
elif m == "ollama":
model_suffix = model_suffix.replace("ollama", f"ollama-{ollama_model}")
# Sanitize for filename
model_suffix = model_suffix.replace(":", "-").replace("/", "-")
out_file = RESULTS_DIR / f"pilot-{ts}_{model_suffix}.json"
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

all_results = {
"timestamp": datetime.now(timezone.utc).isoformat(),
Expand Down
Loading
Loading