@@ -134,3 +134,163 @@ def test_derive_required_substrings_is_deterministic() -> None:
134134
def test_bundled_goldens_constant() -> None:
    """The bundled goldens constant must name a ``.jsonl`` file."""
    expected_suffix = ".jsonl"
    assert BUNDLED_GOLDENS.endswith(expected_suffix)
137+
138+
139+ # ── v0.1.2: project-tunable regress baselines ────────────────────────────────
140+
141+
def test_resolve_baseline_uses_config_defaults() -> None:
    """With no overrides, ``_resolve_baseline`` reports the ``ResolvedConfig`` defaults.

    ``SUPAMEM_BASELINE_*`` environment variables take precedence over the
    config (see the env-override test in this module), so any that happen to
    be exported in the developer's shell or CI job are removed for the
    duration of the call. Without that, this test fails spuriously.
    """
    import os
    from unittest import mock

    from supamem.eval.runner import _resolve_baseline

    cfg = ResolvedConfig()
    # patch.dict snapshots os.environ and restores it on exit, so the
    # removals below never leak into other tests.
    with mock.patch.dict(os.environ):
        # NOTE(review): assumes every baseline override variable shares the
        # SUPAMEM_BASELINE_ prefix, as the two names used in this module do.
        for name in [k for k in os.environ if k.startswith("SUPAMEM_BASELINE_")]:
            del os.environ[name]
        out = _resolve_baseline(cfg)

    assert out["mean_recall_at_5"] == 0.60
    assert out["total_tokens"] == 4000
    assert out["p95_latency_ms"] == 500
150+
151+
def test_resolve_baseline_config_override() -> None:
    """Values set on ``ResolvedConfig`` replace the built-in baseline defaults.

    ``SUPAMEM_BASELINE_*`` environment variables outrank the config (see the
    env-override test in this module), so ambient ones are cleared for the
    duration of the call. That keeps the assertions tied to the config values
    rather than to whatever the host environment exports.
    """
    import os
    from unittest import mock

    from supamem.eval.runner import _resolve_baseline

    cfg = ResolvedConfig(
        regress_baseline_recall_at_5=0.5,
        regress_baseline_total_tokens=20000,
        regress_baseline_p95_latency_ms=1000,
    )
    # patch.dict snapshots os.environ and restores it on exit, so the
    # removals below never leak into other tests.
    with mock.patch.dict(os.environ):
        # NOTE(review): assumes every baseline override variable shares the
        # SUPAMEM_BASELINE_ prefix, as the two names used in this module do.
        for name in [k for k in os.environ if k.startswith("SUPAMEM_BASELINE_")]:
            del os.environ[name]
        out = _resolve_baseline(cfg)

    assert out["mean_recall_at_5"] == 0.5
    assert out["total_tokens"] == 20000
    assert out["p95_latency_ms"] == 1000
164+
165+
def test_resolve_baseline_env_override_beats_config(monkeypatch: pytest.MonkeyPatch) -> None:
    """Environment variables take precedence over values carried by the config."""
    from supamem.eval.runner import _resolve_baseline

    env_overrides = {
        "SUPAMEM_BASELINE_TOTAL_TOKENS": "25000",
        "SUPAMEM_BASELINE_RECALL_AT_5": "0.40",
    }
    config = ResolvedConfig(regress_baseline_total_tokens=10000)
    for var_name, raw_value in env_overrides.items():
        monkeypatch.setenv(var_name, raw_value)

    resolved = _resolve_baseline(config)
    assert resolved["total_tokens"] == 25000
    assert resolved["mean_recall_at_5"] == 0.40
175+
176+
def test_resolve_baseline_malformed_env_falls_back(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A non-numeric env override leaves the config's token baseline in place."""
    from supamem.eval.runner import _resolve_baseline

    monkeypatch.setenv("SUPAMEM_BASELINE_TOTAL_TOKENS", "not-a-number")
    default_config = ResolvedConfig()
    resolved = _resolve_baseline(default_config)
    # The config default is preserved despite the malformed override.
    assert resolved["total_tokens"] == 4000
185+
186+
def test_run_bench_regress_uses_overridden_baseline(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Custom baseline: token usage that would breach default passes a higher cap.

    The same golden case and oversized hit are benchmarked twice in regress
    mode: once with the default config (expected exit code 1 and a
    ``REGRESSION`` message) and once with a raised token baseline (expected
    exit code 0 and ``regress: PASS``).

    Ambient ``SUPAMEM_BASELINE_*`` variables are cleared first so that a value
    exported in the host environment cannot shift either threshold.
    NOTE(review): presumably ``run_bench`` resolves its thresholds through the
    env-aware ``_resolve_baseline`` path; confirm. The cleanup is harmless
    if it does not.
    """
    import os

    # monkeypatch restores the removed variables when the test finishes.
    for name in [k for k in os.environ if k.startswith("SUPAMEM_BASELINE_")]:
        monkeypatch.delenv(name, raising=False)

    p = tmp_path / "g.jsonl"
    p.write_text(
        json.dumps({"id": "c1", "query": "x", "required_substrings": ["chunk"]}) + "\n ",
        encoding="utf-8",
    )

    big_chunk = "chunk " + ("a" * 20_000)  # 5000+ tokens single hit
    fake = MagicMock()
    fake.query.return_value = [_hit(big_chunk)]

    import supamem.eval.runner as mod

    # Swap the real backend for the mock so no external service is needed.
    monkeypatch.setattr(mod, "_build_backend", lambda cfg: fake)

    # Default baseline (4000 tokens) would breach
    rc_default = run_bench(regress=True, goldens_path=str(p), config=_cfg())
    assert rc_default == 1
    assert "REGRESSION" in capsys.readouterr().out

    # Project-tunable baseline raises the cap → passes
    cfg_high = _cfg(regress_baseline_total_tokens=100_000)
    rc_override = run_bench(regress=True, goldens_path=str(p), config=cfg_high)
    assert rc_override == 0
    assert "regress: PASS" in capsys.readouterr().out
217+
218+
def test_run_bench_uses_config_goldens_path_when_flag_omitted(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """With ``goldens_path=None``, ``run_bench`` reads the path from the config (v0.1.2)."""
    import supamem.eval.runner as mod

    golden_case = {"id": "c1", "query": "hello", "required_substrings": ["world"]}
    goldens_file = tmp_path / "g.jsonl"
    goldens_file.write_text(json.dumps(golden_case) + "\n ", encoding="utf-8")

    backend = MagicMock()
    backend.query.return_value = [_hit("hello world")]
    monkeypatch.setattr(mod, "_build_backend", lambda cfg: backend)

    config = _cfg(goldens_path=str(goldens_file))
    exit_code = run_bench(regress=False, goldens_path=None, config=config)

    assert exit_code == 0
    # Exactly one golden case in the file, so exactly one backend query.
    backend.query.assert_called_once()
240+
241+
def test_run_bench_cli_flag_beats_config_goldens_path(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """An explicit ``goldens_path`` argument takes priority over ``cfg.goldens_path``."""
    import supamem.eval.runner as mod

    # Two goldens files with distinguishable queries: whichever query reaches
    # the backend reveals which file was actually loaded.
    config_goldens = tmp_path / "from-config.jsonl"
    config_case = {"id": "from-config", "query": "ignored", "required_substrings": ["x"]}
    config_goldens.write_text(json.dumps(config_case) + "\n ", encoding="utf-8")

    flag_goldens = tmp_path / "from-flag.jsonl"
    flag_case = {"id": "from-flag", "query": "actually-used", "required_substrings": ["x"]}
    flag_goldens.write_text(json.dumps(flag_case) + "\n ", encoding="utf-8")

    recorded: list[str] = []

    def _record_query(q: str, **_: Any) -> list[Any]:
        recorded.append(q)
        return [_hit("x")]

    backend = MagicMock()
    backend.query.side_effect = _record_query
    monkeypatch.setattr(mod, "_build_backend", lambda cfg: backend)

    config = _cfg(goldens_path=str(config_goldens))
    run_bench(regress=False, goldens_path=str(flag_goldens), config=config)

    assert recorded == ["actually-used"]
275+
276+
def test_eval_nested_table_loads_baseline_overrides(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``baseline_*`` keys under ``[supamem.eval]`` populate the ``regress_baseline_*`` fields."""
    from supamem.config import load_config

    config_dir = tmp_path / ".supamem"
    config_dir.mkdir()
    toml_text = (
        '[supamem]\n collection = "x"\n [supamem.eval]\n '
        'baseline_recall_at_5 = 0.55\n '
        'baseline_total_tokens = 18000\n '
        'baseline_p95_latency_ms = 750\n '
    )
    (config_dir / "config.toml").write_text(toml_text, encoding="utf-8")

    # Drop any SUPAMEM_CONFIG inherited from the environment before loading
    # from the temporary working directory.
    monkeypatch.delenv("SUPAMEM_CONFIG", raising=False)
    loaded, provenance = load_config(cwd=tmp_path)

    assert loaded.regress_baseline_recall_at_5 == 0.55
    assert loaded.regress_baseline_total_tokens == 18000
    assert loaded.regress_baseline_p95_latency_ms == 750
    assert provenance.regress_baseline_recall_at_5 == "supamem_toml"
0 commit comments