Skip to content

Commit a03be86

Browse files
dzmitrys-dev and claude committed
release: v0.1.2 — project-tunable regress baselines + config goldens_path
Unblocks brownfield projects (e.g. SoftChat Plan 80.6-14) where the bundled Phase 80.1 thresholds — calibrated against the supamem-internal corpus — don't fit the project's corpus size. Adds three [supamem.eval] config keys plus matching env-var overrides: baseline_recall_at_5 / SUPAMEM_BASELINE_RECALL_AT_5 baseline_total_tokens / SUPAMEM_BASELINE_TOTAL_TOKENS baseline_p95_latency_ms / SUPAMEM_BASELINE_P95_LATENCY_MS Wires cfg.goldens_path as fallback when --goldens flag is omitted (was previously dead config). Defaults preserved — callers without overrides see identical behavior. 188/188 tests green (8 new); ruff clean; twine check PASSED. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 4da404c commit a03be86

6 files changed

Lines changed: 244 additions & 11 deletions

File tree

CHANGELOG.md

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,29 @@
22

33
All notable changes to `supamem` will be documented in this file.
44

5+
## v0.1.2 — 2026-04-29
6+
7+
Project-tunable regress baselines and config-resolved goldens path. Unblocks
8+
brownfield projects (e.g. SoftChat, Plan 80.6-14) where the bundled Phase
9+
80.1 thresholds — calibrated against the supamem-internal corpus — don't fit
10+
the project's corpus size.
11+
12+
### Added
13+
14+
- `[supamem.eval]` config block accepts `baseline_recall_at_5`,
15+
`baseline_total_tokens`, `baseline_p95_latency_ms` to override the bundled
16+
D-19 defaults per project.
17+
- Env-var overrides (highest precedence): `SUPAMEM_BASELINE_RECALL_AT_5`,
18+
`SUPAMEM_BASELINE_TOTAL_TOKENS`, `SUPAMEM_BASELINE_P95_LATENCY_MS`.
19+
- `cfg.goldens_path` now used as fallback when `--goldens` flag is omitted —
20+
previously the config field existed but was ignored by the eval runner.
21+
22+
### Fixed
23+
24+
- `supamem eval --regress` no longer fails projects with healthy retrieval
25+
but corpus sizes outside Phase 80.1's calibration window. Default behavior
26+
is unchanged for callers that don't set overrides.
27+
528
## v0.1.1 — 2026-04-29
629

730
First PyPI release. Hardens v0.1.0 with CI fixes, agent guides, an update-check

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "supamem"
7-
version = "0.1.1"
7+
version = "0.1.2"
88
description = "Project-agnostic dual-memory tooling for Claude Code, Cursor, and opencode"
99
readme = "README.md"
1010
license = "MIT"

src/supamem/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
11
"""supamem — project-agnostic dual-memory tooling."""
2-
__version__ = "0.1.1"
2+
__version__ = "0.1.2"

src/supamem/config.py

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,11 @@ class ResolvedConfig:
4747
goldens_path: str = ""
4848
cache_dir: str = ""
4949
allow_legacy_collection: bool = False
50+
# Regress baselines — Phase 80.1 D-19 defaults; project-tunable for
51+
# corpora outside the supamem-internal calibration set (added v0.1.2).
52+
regress_baseline_recall_at_5: float = 0.60
53+
regress_baseline_total_tokens: int = 4000
54+
regress_baseline_p95_latency_ms: int = 500
5055

5156

5257
@dataclass
@@ -62,6 +67,9 @@ class ConfigChain:
6267
drop_tokens: Source = "default"
6368
goldens_path: Source = "default"
6469
cache_dir: Source = "default"
70+
regress_baseline_recall_at_5: Source = "default"
71+
regress_baseline_total_tokens: Source = "default"
72+
regress_baseline_p95_latency_ms: Source = "default"
6573

6674

6775
_LEGACY_ENV: dict[str, str] = {
@@ -73,7 +81,15 @@ class ConfigChain:
7381

7482
_NESTED_TABLES: list[tuple[str, dict[str, str]]] = [
7583
("hook", {"drop_tokens": "drop_tokens"}),
76-
("eval", {"goldens_path": "goldens_path"}),
84+
(
85+
"eval",
86+
{
87+
"goldens_path": "goldens_path",
88+
"baseline_recall_at_5": "regress_baseline_recall_at_5",
89+
"baseline_total_tokens": "regress_baseline_total_tokens",
90+
"baseline_p95_latency_ms": "regress_baseline_p95_latency_ms",
91+
},
92+
),
7793
("cache", {"cache_dir": "cache_dir"}),
7894
]
7995

src/supamem/eval/runner.py

Lines changed: 42 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111

1212
import json
1313
import logging
14+
import os
1415
import time
1516
from importlib import resources
1617
from pathlib import Path
@@ -22,7 +23,9 @@
2223

2324
log = logging.getLogger("supamem.eval.runner")
2425

25-
# Phase 80.1 locked thresholds (D-19).
26+
# Phase 80.1 locked thresholds (D-19) — defaults; project-tunable since v0.1.2
27+
# via [supamem.eval] baseline_* keys in .supamem/config.toml or env vars
28+
# SUPAMEM_BASELINE_{RECALL_AT_5,TOTAL_TOKENS,P95_LATENCY_MS}.
2629
BASELINE = {
2730
"mean_recall_at_5": 0.60,
2831
"total_tokens": 4000,
@@ -32,6 +35,34 @@
3235
BUNDLED_GOLDENS = "phase_80_1_tuned_hybrid.jsonl"
3336

3437

38+
def _resolve_baseline(cfg: ResolvedConfig) -> dict[str, float]:
39+
"""Merge BASELINE defaults ← config ← env-var overrides.
40+
41+
Env vars (highest precedence): ``SUPAMEM_BASELINE_RECALL_AT_5``,
42+
``SUPAMEM_BASELINE_TOTAL_TOKENS``, ``SUPAMEM_BASELINE_P95_LATENCY_MS``.
43+
Malformed values are logged and fall back to the config value.
44+
"""
45+
out = {
46+
"mean_recall_at_5": float(cfg.regress_baseline_recall_at_5),
47+
"total_tokens": int(cfg.regress_baseline_total_tokens),
48+
"p95_latency_ms": float(cfg.regress_baseline_p95_latency_ms),
49+
}
50+
overrides = (
51+
("SUPAMEM_BASELINE_RECALL_AT_5", "mean_recall_at_5", float),
52+
("SUPAMEM_BASELINE_TOTAL_TOKENS", "total_tokens", int),
53+
("SUPAMEM_BASELINE_P95_LATENCY_MS", "p95_latency_ms", float),
54+
)
55+
for env_var, key, caster in overrides:
56+
raw = os.environ.get(env_var, "").strip()
57+
if not raw:
58+
continue
59+
try:
60+
out[key] = caster(raw)
61+
except ValueError:
62+
log.warning("supamem eval: ignoring malformed %s=%r", env_var, raw)
63+
return out
64+
65+
3566
def _load_goldens(path: str | None) -> list[dict[str, Any]]:
3667
"""Load JSONL records from ``path`` or the bundled corpus."""
3768
if path:
@@ -79,8 +110,10 @@ def run_bench(
79110
) -> int:
80111
"""Run the bench. Returns 0 on pass, 1 on regression / fatal."""
81112
cfg = config or ResolvedConfig()
113+
# CLI flag wins over config; both win over bundled goldens (path=None).
114+
resolved_goldens = goldens_path or (cfg.goldens_path or None)
82115
try:
83-
records = _load_goldens(goldens_path)
116+
records = _load_goldens(resolved_goldens)
84117
except (FileNotFoundError, OSError) as exc:
85118
log.error("supamem eval: failed to load goldens: %s", exc)
86119
return 1
@@ -130,18 +163,19 @@ def run_bench(
130163
if not regress:
131164
return 0
132165

166+
baseline = _resolve_baseline(cfg)
133167
breaches: list[str] = []
134-
if mean_recall < BASELINE["mean_recall_at_5"]:
168+
if mean_recall < baseline["mean_recall_at_5"]:
135169
breaches.append(
136-
f"mean_recall_at_5={mean_recall:.4f} < baseline {BASELINE['mean_recall_at_5']}"
170+
f"mean_recall_at_5={mean_recall:.4f} < baseline {baseline['mean_recall_at_5']}"
137171
)
138-
if total_tokens > BASELINE["total_tokens"]:
172+
if total_tokens > baseline["total_tokens"]:
139173
breaches.append(
140-
f"total_tokens={total_tokens} > baseline {BASELINE['total_tokens']}"
174+
f"total_tokens={total_tokens} > baseline {baseline['total_tokens']}"
141175
)
142-
if p95 > BASELINE["p95_latency_ms"]:
176+
if p95 > baseline["p95_latency_ms"]:
143177
breaches.append(
144-
f"p95_latency_ms={p95:.2f} > baseline {BASELINE['p95_latency_ms']}"
178+
f"p95_latency_ms={p95:.2f} > baseline {baseline['p95_latency_ms']}"
145179
)
146180

147181
if breaches:

tests/test_eval_runner.py

Lines changed: 160 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -134,3 +134,163 @@ def test_derive_required_substrings_is_deterministic() -> None:
134134

135135
def test_bundled_goldens_constant() -> None:
136136
assert BUNDLED_GOLDENS.endswith(".jsonl")
137+
138+
139+
# ── v0.1.2: project-tunable regress baselines ────────────────────────────────
140+
141+
142+
def test_resolve_baseline_uses_config_defaults() -> None:
143+
from supamem.eval.runner import _resolve_baseline
144+
145+
cfg = ResolvedConfig()
146+
out = _resolve_baseline(cfg)
147+
assert out["mean_recall_at_5"] == 0.60
148+
assert out["total_tokens"] == 4000
149+
assert out["p95_latency_ms"] == 500
150+
151+
152+
def test_resolve_baseline_config_override() -> None:
153+
from supamem.eval.runner import _resolve_baseline
154+
155+
cfg = ResolvedConfig(
156+
regress_baseline_recall_at_5=0.5,
157+
regress_baseline_total_tokens=20000,
158+
regress_baseline_p95_latency_ms=1000,
159+
)
160+
out = _resolve_baseline(cfg)
161+
assert out["mean_recall_at_5"] == 0.5
162+
assert out["total_tokens"] == 20000
163+
assert out["p95_latency_ms"] == 1000
164+
165+
166+
def test_resolve_baseline_env_override_beats_config(monkeypatch: pytest.MonkeyPatch) -> None:
167+
from supamem.eval.runner import _resolve_baseline
168+
169+
cfg = ResolvedConfig(regress_baseline_total_tokens=10000)
170+
monkeypatch.setenv("SUPAMEM_BASELINE_TOTAL_TOKENS", "25000")
171+
monkeypatch.setenv("SUPAMEM_BASELINE_RECALL_AT_5", "0.40")
172+
out = _resolve_baseline(cfg)
173+
assert out["total_tokens"] == 25000
174+
assert out["mean_recall_at_5"] == 0.40
175+
176+
177+
def test_resolve_baseline_malformed_env_falls_back(
178+
monkeypatch: pytest.MonkeyPatch,
179+
) -> None:
180+
from supamem.eval.runner import _resolve_baseline
181+
182+
monkeypatch.setenv("SUPAMEM_BASELINE_TOTAL_TOKENS", "not-a-number")
183+
out = _resolve_baseline(ResolvedConfig())
184+
assert out["total_tokens"] == 4000 # config default preserved
185+
186+
187+
def test_run_bench_regress_uses_overridden_baseline(
188+
monkeypatch: pytest.MonkeyPatch,
189+
tmp_path: Path,
190+
capsys: pytest.CaptureFixture[str],
191+
) -> None:
192+
"""Custom baseline: token usage that would breach default passes a higher cap."""
193+
p = tmp_path / "g.jsonl"
194+
p.write_text(
195+
json.dumps({"id": "c1", "query": "x", "required_substrings": ["chunk"]}) + "\n",
196+
encoding="utf-8",
197+
)
198+
199+
big_chunk = "chunk " + ("a" * 20_000) # 5000+ tokens single hit
200+
fake = MagicMock()
201+
fake.query.return_value = [_hit(big_chunk)]
202+
203+
import supamem.eval.runner as mod
204+
205+
monkeypatch.setattr(mod, "_build_backend", lambda cfg: fake)
206+
207+
# Default baseline (4000 tokens) would breach
208+
rc_default = run_bench(regress=True, goldens_path=str(p), config=_cfg())
209+
assert rc_default == 1
210+
assert "REGRESSION" in capsys.readouterr().out
211+
212+
# Project-tunable baseline raises the cap → passes
213+
cfg_high = _cfg(regress_baseline_total_tokens=100_000)
214+
rc_override = run_bench(regress=True, goldens_path=str(p), config=cfg_high)
215+
assert rc_override == 0
216+
assert "regress: PASS" in capsys.readouterr().out
217+
218+
219+
def test_run_bench_uses_config_goldens_path_when_flag_omitted(
220+
monkeypatch: pytest.MonkeyPatch, tmp_path: Path
221+
) -> None:
222+
"""cfg.goldens_path is used as fallback when --goldens flag not passed (v0.1.2)."""
223+
p = tmp_path / "g.jsonl"
224+
p.write_text(
225+
json.dumps({"id": "c1", "query": "hello", "required_substrings": ["world"]}) + "\n",
226+
encoding="utf-8",
227+
)
228+
229+
fake = MagicMock()
230+
fake.query.return_value = [_hit("hello world")]
231+
232+
import supamem.eval.runner as mod
233+
234+
monkeypatch.setattr(mod, "_build_backend", lambda cfg: fake)
235+
236+
cfg = _cfg(goldens_path=str(p))
237+
rc = run_bench(regress=False, goldens_path=None, config=cfg)
238+
assert rc == 0
239+
fake.query.assert_called_once()
240+
241+
242+
def test_run_bench_cli_flag_beats_config_goldens_path(
243+
monkeypatch: pytest.MonkeyPatch, tmp_path: Path
244+
) -> None:
245+
"""Explicit --goldens still wins over cfg.goldens_path."""
246+
cfg_path = tmp_path / "from-config.jsonl"
247+
cfg_path.write_text(
248+
json.dumps({"id": "from-config", "query": "ignored", "required_substrings": ["x"]})
249+
+ "\n",
250+
encoding="utf-8",
251+
)
252+
cli_path = tmp_path / "from-flag.jsonl"
253+
cli_path.write_text(
254+
json.dumps({"id": "from-flag", "query": "actually-used", "required_substrings": ["x"]})
255+
+ "\n",
256+
encoding="utf-8",
257+
)
258+
259+
seen_queries: list[str] = []
260+
261+
def query(q: str, **_: Any) -> list[Any]:
262+
seen_queries.append(q)
263+
return [_hit("x")]
264+
265+
fake = MagicMock()
266+
fake.query.side_effect = query
267+
268+
import supamem.eval.runner as mod
269+
270+
monkeypatch.setattr(mod, "_build_backend", lambda cfg: fake)
271+
272+
cfg = _cfg(goldens_path=str(cfg_path))
273+
run_bench(regress=False, goldens_path=str(cli_path), config=cfg)
274+
assert seen_queries == ["actually-used"]
275+
276+
277+
def test_eval_nested_table_loads_baseline_overrides(
278+
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
279+
) -> None:
280+
"""[supamem.eval] baseline_* keys land in ResolvedConfig fields."""
281+
from supamem.config import load_config
282+
283+
(tmp_path / ".supamem").mkdir()
284+
(tmp_path / ".supamem" / "config.toml").write_text(
285+
'[supamem]\ncollection = "x"\n[supamem.eval]\n'
286+
'baseline_recall_at_5 = 0.55\n'
287+
'baseline_total_tokens = 18000\n'
288+
'baseline_p95_latency_ms = 750\n',
289+
encoding="utf-8",
290+
)
291+
monkeypatch.delenv("SUPAMEM_CONFIG", raising=False)
292+
cfg, chain = load_config(cwd=tmp_path)
293+
assert cfg.regress_baseline_recall_at_5 == 0.55
294+
assert cfg.regress_baseline_total_tokens == 18000
295+
assert cfg.regress_baseline_p95_latency_ms == 750
296+
assert chain.regress_baseline_recall_at_5 == "supamem_toml"

0 commit comments

Comments (0)