|
11 | 11 | from pathlib import Path |
12 | 12 |
|
13 | 13 | from .db import state_dir |
14 | | -from .schema import Baseline, CandidateStatus, Db, ExperimentStatus |
| 14 | +from .schema import Baseline, CandidateStatus, Db, ExperimentStatus, ScenarioResult |
15 | 15 |
|
16 | 16 | METRICS_NAME = "metrics.md" |
| 17 | +F1_MATRIX_NAME = "f1-matrix.md" |
17 | 18 |
|
18 | 19 |
|
19 | 20 | def _path(root: Path) -> Path: |
@@ -104,6 +105,13 @@ def render(db: Db, root: Path = Path(".")) -> str: |
104 | 105 | ) |
105 | 106 | lines.append("") |
106 | 107 |
|
| 108 | + compact = _f1_matrix_compact(db) |
| 109 | + if compact: |
| 110 | + lines.append("## Current F1 matrix (vs baseline)") |
| 111 | + lines.extend(compact) |
| 112 | + lines.append(f"_Full per-scenario table: `.coordinator/{F1_MATRIX_NAME}`_") |
| 113 | + lines.append("") |
| 114 | + |
107 | 115 | # Harness meta |
108 | 116 | lines.append("## Harness") |
109 | 117 | hit, tot = _review_hit_rate(db) |
@@ -206,6 +214,109 @@ def render(db: Db, root: Path = Path(".")) -> str: |
206 | 214 | return "\n".join(lines) |
207 | 215 |
|
208 | 216 |
|
def _matrix_from_shipped(db: Db) -> dict[str, dict[str, ScenarioResult]]:
    """Collect the latest shipped result for every (detector, scenario) pair.

    Experiments are visited in the iteration order of ``db.experiments``
    (insertion order, which the data model treats as chronological), so a
    later experiment overwrites an earlier one for the same pair. An
    experiment contributes only when its candidate exists and is SHIPPED.

    Per-scenario keys have the form ``<detector>/<scenario>``; keys without
    a ``/`` are ignored. The result is keyed by detector, then by scenario.
    """
    latest: dict[str, dict[str, ScenarioResult]] = {}
    for experiment in db.experiments.values():
        candidate = db.candidates.get(experiment.candidate_id)
        if candidate and candidate.status == CandidateStatus.SHIPPED:
            for composite_key, result in experiment.per_scenario.items():
                # Split on the first "/" only: scenario names may contain more.
                detector, slash, scenario = composite_key.partition("/")
                if not slash:
                    continue
                if detector not in latest:
                    latest[detector] = {}
                latest[detector][scenario] = result
    return latest
| 236 | + |
| 237 | + |
def _like_for_like_means(det_base, det_current) -> tuple[float, float, int] | None:
    """Return ``(baseline_mean, current_mean, n)`` over comparable scenarios.

    Only scenarios that exist in the baseline AND have a shipped result are
    averaged, so both means cover the same ``n`` scenarios. Results for
    scenarios unknown to the baseline are ignored; they have nothing to be
    compared against and would otherwise push ``n`` past the baseline total.

    Returns ``None`` when no baseline scenario has a shipped result.
    """
    paired = [
        (base_sr.f1, det_current[scen].f1)
        for scen, base_sr in det_base.scenarios.items()
        if scen in det_current
    ]
    if not paired:
        return None
    n = len(paired)
    cur_mean = sum(cur for _, cur in paired) / n
    if n == len(det_base.scenarios):
        # Full coverage: keep the stored baseline mean so this figure agrees
        # with every other report that prints `mean_f1`.
        base_mean = det_base.mean_f1
    else:
        # Partial coverage: average the baseline over the same subset,
        # otherwise the delta would mix two different scenario sets.
        base_mean = sum(base for base, _ in paired) / n
    return base_mean, cur_mean, n


def build_f1_matrix(db: Db) -> str:
    """Per-detector × per-scenario F1 matrix (baseline → current, Δ).

    Returns a markdown document with one section per baseline detector.
    Each section holds a table with one row per baseline scenario (train
    first, then lockbox, then the rest) and, when at least one scenario has
    a shipped result, a like-for-like mean-F1 line.

    Caveat: the value for a given (detector, scenario) pair comes from the
    most-recent shipped experiment that reported that pair. Scenarios no
    shipped experiment has reported show "—" and are left out of the mean.
    """
    lines: list[str] = ["# F1 matrix (per-detector × per-scenario)\n"]
    if not db.baseline:
        lines.append("_(no baseline)_\n")
        return "\n".join(lines)

    current = _matrix_from_shipped(db)
    train = db.split.as_train_set() if db.split else set()
    lockbox = db.split.as_lockbox_set() if db.split else set()

    lines.append(f"Baseline SHA: `{db.baseline.sha}` · Generated: {db.baseline.generated_at}")
    ship_count = sum(1 for c in db.candidates.values() if c.status == CandidateStatus.SHIPPED)
    lines.append(f"Shipped candidates reflected: {ship_count}\n")

    for det_name, det_base in db.baseline.detectors.items():
        lines.append(f"## {det_name}")
        det_current = current.get(det_name, {})
        if not det_current:
            lines.append("_(no shipped experiments have updated this detector; showing baseline only)_\n")
        # Order: train first, then lockbox, then any extras.
        all_scen = list(det_base.scenarios)
        ordered = (
            [s for s in all_scen if s in train]
            + [s for s in all_scen if s in lockbox]
            + [s for s in all_scen if s not in train and s not in lockbox]
        )
        lines.append("| Scenario | Split | Baseline F1 | Current F1 | ΔF1 | FPs base → cur |")
        lines.append("|---|---|---:|---:|---:|---:|")
        for scen in ordered:
            base_sr = det_base.scenarios[scen]
            cur_sr = det_current.get(scen)
            split_tag = "train" if scen in train else ("lockbox" if scen in lockbox else "other")
            if cur_sr is None:
                lines.append(
                    f"| `{scen}` | {split_tag} | {base_sr.f1:.3f} | — | — | {base_sr.num_baseline_fps} → — |"
                )
            else:
                df1 = cur_sr.f1 - base_sr.f1
                lines.append(
                    f"| `{scen}` | {split_tag} | {base_sr.f1:.3f} | {cur_sr.f1:.3f} "
                    f"| {df1:+.3f} | {base_sr.num_baseline_fps} → {cur_sr.num_baseline_fps} |"
                )
        # Aggregate: both means cover exactly the scenarios shown with a
        # current value in the table above.
        means = _like_for_like_means(det_base, det_current)
        if means is not None:
            base_mean, cur_mean, updated = means
            lines.append(
                f"\n**mean F1**: {base_mean:.4f} → "
                f"{cur_mean:.4f} "
                f"(over {updated}/{len(all_scen)} scenarios updated)"
            )
        lines.append("")
    return "\n".join(lines)


def _f1_matrix_compact(db: Db) -> list[str]:
    """One-line-per-detector summary suitable for embedding in metrics.md.

    Returns an empty list when there is no baseline. A detector with no
    shipped result for any baseline scenario is reported as unchanged at
    its stored baseline mean; otherwise the line shows the like-for-like
    baseline → current mean F1, the delta, and how many baseline scenarios
    have a shipped result.
    """
    if not db.baseline:
        return []
    current = _matrix_from_shipped(db)
    out: list[str] = []
    for det_name, det_base in db.baseline.detectors.items():
        means = _like_for_like_means(det_base, current.get(det_name, {}))
        if means is None:
            out.append(f"- **{det_name}**: baseline mean F1 {det_base.mean_f1:.4f} (unchanged)")
            continue
        base_mean, cur_mean, updated = means
        d = cur_mean - base_mean
        out.append(
            f"- **{det_name}**: {base_mean:.4f} → {cur_mean:.4f} "
            f"(Δ{d:+.4f}, {updated}/{len(det_base.scenarios)} scenarios updated)"
        )
    return out
| 318 | + |
| 319 | + |
209 | 320 | def _min_over(detector_baseline, attr: str, scope: set[str] | None) -> float: |
210 | 321 | """Min value of `attr` over scenarios in `scope` (or all if scope=None).""" |
211 | 322 | vals = [ |
@@ -237,3 +348,5 @@ def regenerate(db: Db, root: Path = Path(".")) -> None: |
237 | 348 | p = _path(root) |
238 | 349 | p.parent.mkdir(parents=True, exist_ok=True) |
239 | 350 | p.write_text(render(db, root)) |
| 351 | + matrix_path = state_dir(root) / F1_MATRIX_NAME |
| 352 | + matrix_path.write_text(build_f1_matrix(db)) |
0 commit comments