Skip to content

Commit b2bf619

Browse files
raifdmueller and claude committed
refactor: store summaries (17KB) instead of full results (220KB) per model
- New: evaluations/summaries/ — scores only, no raw responses or per-permutation details. 17KB vs 220KB per model. - evaluations/results/ added to .gitignore (full results reproducible via pilot.py) - Report generator reads from summaries/ - pilot.py auto-generates summary after each run - Fixed filename suffix to use exact model IDs (CodeRabbit feedback) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4602bbe commit b2bf619

15 files changed

Lines changed: 4775 additions & 62665 deletions

evaluations/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
*.pyc
2+
# Full results with raw responses (large, reproducible via pilot.py)
3+
results/

evaluations/generate-report.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from collections import defaultdict
2020
from pathlib import Path
2121

22-
RESULTS_DIR = Path(__file__).parent / "results"
22+
RESULTS_DIR = Path(__file__).parent / "summaries"
2323
SPECS_DIR = Path(__file__).parent / "specs"
2424

2525
# Fallback display names (used when config doesn't have exact model ID)

evaluations/pilot.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -312,19 +312,15 @@ def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_
312312
print()
313313

314314
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
315-
# Include model names in filename to prevent race conditions on parallel runs
316-
model_suffix = "_".join(models)
315+
# Include exact model IDs in filename to prevent race conditions
316+
model_ids = []
317317
for m in models:
318-
if m == "openai":
319-
model_suffix = model_suffix.replace("openai", openai_model)
320-
elif m == "mistral":
321-
model_suffix = model_suffix.replace("mistral", mistral_model)
322-
elif m == "deepseek":
323-
model_suffix = model_suffix.replace("deepseek", deepseek_model)
324-
elif m == "ollama":
325-
model_suffix = model_suffix.replace("ollama", f"ollama-{ollama_model}")
326-
# Sanitize for filename
327-
model_suffix = model_suffix.replace(":", "-").replace("/", "-")
318+
if m == "openai": model_ids.append(openai_model)
319+
elif m == "mistral": model_ids.append(mistral_model)
320+
elif m == "deepseek": model_ids.append(deepseek_model)
321+
elif m == "ollama": model_ids.append(f"ollama-{ollama_model}")
322+
else: model_ids.append(m)
323+
model_suffix = "_".join(model_ids).replace(":", "-").replace("/", "-")
328324
out_file = RESULTS_DIR / f"pilot-{ts}_{model_suffix}.json"
329325

330326
all_results = {
@@ -462,6 +458,18 @@ def append_and_save(r):
462458
save_results(all_results, out_file)
463459
print(f"\nResults saved to {out_file}")
464460

461+
# Also save a stripped summary (scores only, no raw responses)
462+
summary_dir = RESULTS_DIR.parent / "summaries"
463+
summary_dir.mkdir(parents=True, exist_ok=True)
464+
summary = json.loads(json.dumps(all_results)) # deep copy
465+
for m_results in summary.get("models", {}).values():
466+
for r in m_results:
467+
r.pop("results", None)
468+
summary_file = summary_dir / out_file.name
469+
with open(summary_file, "w", encoding="utf-8") as fh:
470+
json.dump(summary, fh, indent=2, ensure_ascii=False)
471+
print(f"Summary saved to {summary_file}")
472+
465473
# Summary
466474
print("\n=== SUMMARY ===")
467475
print(f"Models: {', '.join(models)}")

0 commit comments

Comments
 (0)