|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Generate token-efficiency scatter plot for LongMemEval systems. |
| 4 | +
|
| 5 | +Metric: tokens per correct answer |
| 6 | + = (total_tokens / total_questions) / accuracy_rate |
| 7 | +
|
| 8 | +Axis convention: higher + more to the right = better. |
| 9 | +- X: accuracy (higher right = better) |
| 10 | +- Y: tokens/correct, log scale, INVERTED (fewer tokens = higher on plot = better) |
| 11 | +
|
| 12 | +AXME tokens are MEASURED from our 500-question run. |
| 13 | +Competitor tokens are ESTIMATED from their published methodology |
| 14 | +(Observer/Reflector calls, fact extraction, graph construction, etc.). |
| 15 | +
|
| 16 | +Rationale for tokens vs dollars: |
| 17 | +- Model-agnostic (Sonnet, gpt-4o, gpt-5-mini — price changes, token counts don't) |
| 18 | +- Measures architecture efficiency independent of LLM provider |
| 19 | +- Cannot be disputed by "but your pricing is wrong" arguments |
| 20 | +""" |
| 21 | + |
| 22 | +import matplotlib.pyplot as plt |
| 23 | + |
# ─── Data ─────────────────────────────────────────────────────────────

# Format: (label, tokens_per_question, accuracy_pct, model, color, is_axme, measured)
#   tokens_per_question — mean tokens consumed per benchmark question
#   accuracy_pct        — LongMemEval end-to-end accuracy, in percent
#   color               — marker fill color on the scatter plot
#   is_axme             — True for our system (gets highlighted marker/label styling)
#   measured            — True if tokens come from an actual run (per module
#                         docstring: AXME measured, competitors estimated)
systems = [
    ("AXME Code", 9_100, 89.20, "Sonnet 4.6", "#4ab8ff", True, True),
    ("Mastra OM", 100_000, 94.87, "gpt-5-mini", "#b080e8", False, False),
    ("Mastra OM", 100_000, 84.23, "gpt-4o", "#b080e8", False, False),
    ("Supermemory", 25_000, 85.40, "gpt-4o", "#e880b0", False, False),
    ("Zep", 50_000, 71.20, "gpt-4o", "#e8a880", False, False),
    ("Mem0", 15_000, 49.00, "gpt-4o", "#80e8a8", False, False),
]
| 35 | + |
| 36 | + |
def tokens_per_correct(tokens_per_q: float, accuracy_pct: float) -> float:
    """Return the expected token cost per *correct* answer.

    tokens/correct = tokens_per_question / accuracy_rate, i.e. how many
    tokens a system burns, on average, for each question it gets right.

    Args:
        tokens_per_q: Mean tokens consumed per benchmark question.
        accuracy_pct: Accuracy as a percentage, in (0, 100].

    Returns:
        Tokens spent per correct answer.

    Raises:
        ValueError: If accuracy_pct is not positive — the metric is
            undefined (previously this surfaced as a bare ZeroDivisionError).
    """
    if accuracy_pct <= 0:
        raise ValueError(f"accuracy_pct must be > 0, got {accuracy_pct}")
    return tokens_per_q / (accuracy_pct / 100)
| 39 | + |
| 40 | + |
# ─── Plot ─────────────────────────────────────────────────────────────

BG_COLOR = "#1a1a1a"  # shared dark background for figure and axes

plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(10, 7), facecolor=BG_COLOR)
ax.set_facecolor(BG_COLOR)

# Faint dashed grid, rendered underneath the scatter points.
ax.set_axisbelow(True)
ax.grid(True, linestyle="--", color="#888", alpha=0.15)
| 49 | + |
for label, tok_q, accuracy, model, color, is_axme, measured in systems:
    tok_correct = tokens_per_correct(tok_q, accuracy)

    # AXME gets a bigger marker with a white rim; competitors are muted.
    if is_axme:
        size, edge, lw = 380, "white", 2.5
    else:
        size, edge, lw = 220, "#555", 1.0

    # X = accuracy, Y = tokens/correct
    ax.scatter(accuracy, tok_correct, s=size, c=color,
               edgecolors=edge, linewidths=lw, alpha=0.95, zorder=3)

    display_label = f"{label}\n({model})"

    if is_axme:
        # AXME sits top-left — drop its label just below the point.
        ax.annotate(display_label, (accuracy, tok_correct), xytext=(0, -32),
                    textcoords="offset points", color="white", ha="center",
                    fontsize=11, fontweight="bold")
        continue

    # Competitor labels: per-point pixel offsets and alignment.
    # Mastra's points sit far right, so their labels go to the left.
    offsets = {
        ("Mastra OM", "gpt-5-mini"): (-14, 6),
        ("Mastra OM", "gpt-4o"): (-14, 6),
        ("Supermemory", "gpt-4o"): (14, 6),
        ("Zep", "gpt-4o"): (14, 6),
        ("Mem0", "gpt-4o"): (14, 6),
    }
    ha_map = {
        ("Mastra OM", "gpt-5-mini"): "right",
        ("Mastra OM", "gpt-4o"): "right",
    }
    dx, dy = offsets.get((label, model), (14, 6))
    ax.annotate(display_label, (accuracy, tok_correct), xytext=(dx, dy),
                textcoords="offset points", color="#ccc",
                ha=ha_map.get((label, model), "left"),
                fontsize=10, fontweight="normal")
| 87 | + |
# Axis labels (the Y label spells out the inverted log convention below)
label_style = dict(color="white", fontsize=12, labelpad=10)
ax.set_xlabel("LongMemEval E2E accuracy (%)", **label_style)
ax.set_ylabel("Tokens per correct answer (log scale, fewer = better)",
              **label_style)

# Log-scale Y with limits given high→low, so fewer tokens plots HIGHER.
ax.set_yscale("log")
ax.set_xlim(40, 100)
ax.set_ylim(300_000, 7_000)
| 98 | + |
# Y tick formatter
def fmt_tokens(y, _):
    """Render a tick value as '2M' / '50K' / '500' for the token axis."""
    for threshold, suffix in ((1_000_000, "M"), (1_000, "K")):
        if y >= threshold:
            return f"{y / threshold:.0f}{suffix}"
    return str(int(y))
ax.yaxis.set_major_formatter(plt.FuncFormatter(fmt_tokens))

# Muted chrome: dim spines and tick labels so the data points stand out.
for sp in ax.spines.values():
    sp.set_edgecolor("#444")
ax.tick_params(colors="#bbb", which="both")

ax.set_title("Memory Systems: Token Efficiency on LongMemEval",
             color="white", fontweight="bold", fontsize=14, pad=20)

# Reading aid in the bottom-left corner: which direction is "better".
ax.text(0.03, 0.05, "↗ Top-right = best (high accuracy, fewer tokens)",
        transform=ax.transAxes, va="bottom", ha="left",
        style="italic", fontsize=9, color="#888")
| 120 | + |
# Callout arrow pointing at the AXME data point.
axme_blue = "#4ab8ff"
ax.annotate("AXME Code uses ~10× fewer tokens\nthan Mastra at 89% accuracy",
            xy=(89.20, 10_200), xytext=(0.35, 0.80),
            textcoords="axes fraction", ha="center",
            color=axme_blue, fontsize=10, fontweight="bold",
            arrowprops=dict(arrowstyle="->", lw=1.5, color=axme_blue,
                            connectionstyle="arc3,rad=-0.2"))

# Methodology footnote under the plot.
footer = (
    "AXME tokens measured from 500-question run. Competitor tokens estimated from published methodology "
    "(Observer/Reflector calls, fact extraction, graph construction). Model-agnostic — pricing changes, "
    "tokens don't."
)
fig.text(0.5, 0.025, footer, ha="center", color="#888",
         fontsize=8, style="italic", wrap=True)

plt.tight_layout(rect=[0, 0.05, 1, 1])

# Export both a vector and a raster version on the same dark background.
for out_name, out_fmt, out_dpi in (
    ("token-performance.svg", "svg", 150),
    ("token-performance.png", "png", 200),
):
    plt.savefig(out_name, format=out_fmt, facecolor="#1a1a1a",
                bbox_inches="tight", dpi=out_dpi)
| 143 | + |
# Print table
def _fmt_thousands(value: float) -> str:
    """Render a token count as e.g. '25K' when >= 1000, else a plain integer."""
    return f"{value / 1000:.0f}K" if value >= 1000 else f"{value:.0f}"


print(f"\n{'System':<14} {'Model':<14} {'tok/Q':>10} {'Accuracy':>10} {'tok/correct':>14}")
print("─" * 70)
for label, tpq, acc, model, _, is_axme, measured in systems:
    tpc = tokens_per_correct(tpq, acc)
    marker = " ✓" if measured else ""  # ✓ flags measured (not estimated) token counts
    print(f"{label:<14} {model:<14} {_fmt_thousands(tpq):>10} {acc:>9.2f}% "
          f"{_fmt_thousands(tpc):>14}{marker}")
# Plain string literal: the original was an f-string with no placeholders (lint F541).
print("\n✓ = measured; others estimated from published methodology\n")
0 commit comments