|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""csv_tldr.py — compress a bench CSV into a single Slack/standup line. |
| 3 | +
|
| 4 | +Biologists running `cellsim fep-binding bench` or `cellsim fep-freesolv` |
| 5 | +end up with a CSV that the full fep-report flow expands into a multi- |
| 6 | +section markdown. For a quick status update ("how's the run going?") |
| 7 | +the full report is overkill — what's wanted is a one-liner you can |
| 8 | +paste into Slack or an email subject. |
| 9 | +
|
| 10 | +Usage: |
| 11 | + python scripts/csv_tldr.py run/fep/.../freesolv_results.csv |
| 12 | + python scripts/csv_tldr.py path/to/bench_results.csv --gate 2.0 |
| 13 | +
|
| 14 | +The script auto-detects hydration vs binding from the ΔG sign |
| 15 | +pattern (hydration has mixed signs, binding has all negatives for |
| 16 | +binders), picks the appropriate gate (1.5 kcal/mol hydration, |
| 17 | +2.0 kcal/mol binding), and emits: |
| 18 | +
|
| 19 | + FreeSolv hydration FEP: 12/12 ok, MAE 0.42 kcal/mol (gate ≤1.5), |
| 20 | + Pearson r +0.993 — PASS |
| 21 | +
|
| 22 | +Exit codes: |
| 23 | + 0 = PASS (all gates met) |
| 24 | + 1 = FAIL (one or more gate failed) |
| 25 | + 2 = inconclusive (missing data; partial run) |
| 26 | + 3 = usage / I/O error |
| 27 | +""" |
| 28 | +from __future__ import annotations |
| 29 | + |
| 30 | +import argparse |
| 31 | +import csv |
| 32 | +import math |
| 33 | +import sys |
| 34 | +from pathlib import Path |
| 35 | + |
| 36 | + |
| 37 | +def _load_rows(csv_path: Path) -> list[dict]: |
| 38 | + if not csv_path.is_file(): |
| 39 | + raise FileNotFoundError(f"no such file: {csv_path}") |
| 40 | + with csv_path.open("r", encoding="utf-8-sig", newline="") as fi: |
| 41 | + reader = csv.DictReader(fi) |
| 42 | + return list(reader) |
| 43 | + |
| 44 | + |
| 45 | +def _f(v): |
| 46 | + if v in (None, "", "None", "nan"): |
| 47 | + return None |
| 48 | + try: |
| 49 | + return float(v) |
| 50 | + except (TypeError, ValueError): |
| 51 | + return None |
| 52 | + |
| 53 | + |
| 54 | +def _pearson(x: list[float], y: list[float]) -> float | None: |
| 55 | + if len(x) < 3 or len(x) != len(y): |
| 56 | + return None |
| 57 | + mx = sum(x) / len(x) |
| 58 | + my = sum(y) / len(y) |
| 59 | + num = sum((xi - mx) * (yi - my) for xi, yi in zip(x, y)) |
| 60 | + dx = math.sqrt(sum((xi - mx) ** 2 for xi in x)) |
| 61 | + dy = math.sqrt(sum((yi - my) ** 2 for yi in y)) |
| 62 | + if dx == 0 or dy == 0: |
| 63 | + return None |
| 64 | + return num / (dx * dy) |
| 65 | + |
| 66 | + |
| 67 | +def _kendall_tau(x: list[float], y: list[float]) -> float | None: |
| 68 | + """Kendall τ-a — O(n²), fine for <100 compounds.""" |
| 69 | + n = len(x) |
| 70 | + if n < 3 or n != len(y): |
| 71 | + return None |
| 72 | + conc = disc = 0 |
| 73 | + for i in range(n): |
| 74 | + for j in range(i + 1, n): |
| 75 | + dx = x[i] - x[j] |
| 76 | + dy = y[i] - y[j] |
| 77 | + if dx * dy > 0: |
| 78 | + conc += 1 |
| 79 | + elif dx * dy < 0: |
| 80 | + disc += 1 |
| 81 | + denom = n * (n - 1) / 2.0 |
| 82 | + if denom == 0: |
| 83 | + return None |
| 84 | + return (conc - disc) / denom |
| 85 | + |
| 86 | + |
| 87 | +def detect_kind(rows: list[dict]) -> str: |
| 88 | + """Hydration (FreeSolv-style): ΔG_expt spans positive + negative. |
| 89 | + Binding: all expt ΔG < 0 (binders only).""" |
| 90 | + expts = [_f(r.get("dG_expt_kcalmol")) for r in rows] |
| 91 | + expts = [e for e in expts if e is not None] |
| 92 | + if not expts: |
| 93 | + return "unknown" |
| 94 | + if any(e > 0 for e in expts) and any(e < 0 for e in expts): |
| 95 | + return "hydration" |
| 96 | + if all(e < 0 for e in expts): |
| 97 | + return "binding" |
| 98 | + return "unknown" |
| 99 | + |
| 100 | + |
| 101 | +def _label_for_kind(kind: str, rows: list[dict]) -> str: |
| 102 | + names = [(r.get("name") or "").lower() for r in rows] |
| 103 | + joined = " ".join(names) |
| 104 | + if kind == "hydration": |
| 105 | + return "FreeSolv hydration FEP" |
| 106 | + if kind == "binding": |
| 107 | + if "biotin" in joined or "streptavidin" in joined: |
| 108 | + return "streptavidin binding FEP" |
| 109 | + if ("erlotinib" in joined or "gefitinib" in joined |
| 110 | + or "lapatinib" in joined or "egfr" in joined): |
| 111 | + return "EGFR kinase binding FEP" |
| 112 | + return "binding FEP" |
| 113 | + return "FEP" |
| 114 | + |
| 115 | + |
| 116 | +def summarise(rows: list[dict], gate: float | None = None) -> dict: |
| 117 | + """Return the TL;DR bundle: label, counts, metrics, verdict.""" |
| 118 | + n_total = len(rows) |
| 119 | + ok_rows = [r for r in rows |
| 120 | + if (r.get("ok") or "").strip().lower() in ( |
| 121 | + "true", "1", "yes") |
| 122 | + and _f(r.get("dG_pred_kcalmol")) is not None] |
| 123 | + n_ok = len(ok_rows) |
| 124 | + kind = detect_kind(rows) |
| 125 | + label = _label_for_kind(kind, rows) |
| 126 | + gate_kcal = gate |
| 127 | + if gate_kcal is None: |
| 128 | + gate_kcal = 1.5 if kind == "hydration" else 2.0 |
| 129 | + |
| 130 | + if not ok_rows: |
| 131 | + return { |
| 132 | + "label": label, "kind": kind, |
| 133 | + "n_total": n_total, "n_ok": n_ok, |
| 134 | + "mae": None, "pearson": None, "kendall": None, |
| 135 | + "sign_ok": None, "gate": gate_kcal, |
| 136 | + "verdict": "inconclusive", |
| 137 | + "reason": "no ok rows", |
| 138 | + } |
| 139 | + |
| 140 | + expts = [_f(r["dG_expt_kcalmol"]) for r in ok_rows] |
| 141 | + preds = [_f(r["dG_pred_kcalmol"]) for r in ok_rows] |
| 142 | + abs_res = [abs(p - e) for p, e in zip(preds, expts)] |
| 143 | + mae = sum(abs_res) / len(abs_res) |
| 144 | + pearson = _pearson(expts, preds) |
| 145 | + kendall = _kendall_tau(expts, preds) |
| 146 | + |
| 147 | + # Sign rule. |
| 148 | + sign_ok: bool | None = None |
| 149 | + sign_reason = "" |
| 150 | + if kind == "hydration": |
| 151 | + # Require sign-correct on every compound. |
| 152 | + sign_ok = all( |
| 153 | + (e > 0 and p > 0) or (e < 0 and p < 0) |
| 154 | + or (abs(e) < 0.5 and abs(p) < 0.5) |
| 155 | + for e, p in zip(expts, preds)) |
| 156 | + if not sign_ok: |
| 157 | + sign_reason = "sign flip on ≥1 compound" |
| 158 | + elif kind == "binding": |
| 159 | + sign_ok = all(p < 0 for p in preds) |
| 160 | + if not sign_ok: |
| 161 | + sign_reason = "non-binder predicted" |
| 162 | + |
| 163 | + mae_ok = mae <= gate_kcal |
| 164 | + complete = (n_ok == n_total) |
| 165 | + |
| 166 | + if not complete: |
| 167 | + verdict = "inconclusive" |
| 168 | + elif mae_ok and (sign_ok is not False): |
| 169 | + verdict = "PASS" |
| 170 | + else: |
| 171 | + verdict = "FAIL" |
| 172 | + |
| 173 | + return { |
| 174 | + "label": label, "kind": kind, |
| 175 | + "n_total": n_total, "n_ok": n_ok, |
| 176 | + "mae": mae, "pearson": pearson, "kendall": kendall, |
| 177 | + "sign_ok": sign_ok, "gate": gate_kcal, |
| 178 | + "verdict": verdict, |
| 179 | + "reason": sign_reason if not sign_ok else "", |
| 180 | + } |
| 181 | + |
| 182 | + |
| 183 | +def format_tldr(s: dict) -> str: |
| 184 | + n_ok = s["n_ok"] |
| 185 | + n_total = s["n_total"] |
| 186 | + label = s["label"] |
| 187 | + if s["mae"] is None: |
| 188 | + return (f"{label}: {n_ok}/{n_total} ok — " |
| 189 | + f"{s['verdict']} ({s['reason'] or 'no data'})") |
| 190 | + mae_str = f"MAE {s['mae']:.2f} kcal/mol (gate ≤{s['gate']:.1f})" |
| 191 | + corr = "" |
| 192 | + if s["kind"] == "hydration" and s["pearson"] is not None: |
| 193 | + corr = f", Pearson r {s['pearson']:+.3f}" |
| 194 | + elif s["kind"] == "binding" and s["kendall"] is not None: |
| 195 | + corr = f", Kendall τ {s['kendall']:+.2f}" |
| 196 | + tail = "" |
| 197 | + if s["reason"]: |
| 198 | + tail = f" ({s['reason']})" |
| 199 | + return (f"{label}: {n_ok}/{n_total} ok, {mae_str}" |
| 200 | + f"{corr} — {s['verdict']}{tail}") |
| 201 | + |
| 202 | + |
| 203 | +def main(argv: list[str] | None = None) -> int: |
| 204 | + ap = argparse.ArgumentParser(description=__doc__) |
| 205 | + ap.add_argument("csv_path", |
| 206 | + help="bench CSV (from `cellsim fep-binding bench` " |
| 207 | + "or `cellsim fep-freesolv`)") |
| 208 | + ap.add_argument("--gate", type=float, default=None, |
| 209 | + help="MAE gate in kcal/mol (default: 1.5 for " |
| 210 | + "hydration, 2.0 for binding)") |
| 211 | + args = ap.parse_args(argv) |
| 212 | + |
| 213 | + try: |
| 214 | + rows = _load_rows(Path(args.csv_path)) |
| 215 | + except (FileNotFoundError, OSError) as e: |
| 216 | + print(f"csv_tldr: {e}", file=sys.stderr) |
| 217 | + return 3 |
| 218 | + |
| 219 | + s = summarise(rows, gate=args.gate) |
| 220 | + print(format_tldr(s)) |
| 221 | + if s["verdict"] == "PASS": |
| 222 | + return 0 |
| 223 | + if s["verdict"] == "FAIL": |
| 224 | + return 1 |
| 225 | + return 2 |
| 226 | + |
| 227 | + |
| 228 | +if __name__ == "__main__": |
| 229 | + sys.exit(main()) |
0 commit comments