Skip to content

Commit 1542ac0

Browse files
committed
scripts/csv_tldr: one-line Slack/standup summariser for bench CSVs
The full fep-report flow expands a bench CSV into a multi-section markdown that's correct for a project journal but overkill for "how's the run going?" on Slack. csv_tldr prints exactly one line, with an honest exit code CI can gate on. Auto-detects hydration (mixed-sign ΔG_expt) vs binding (all negative binders) and picks the matching gate (1.5 / 2.0 kcal/mol). Recognises streptavidin vs EGFR bench from compound names so the line reads "streptavidin binding FEP" or "EGFR kinase binding FEP" instead of a generic label. Sign rules pinned: - hydration: every compound sign-correct (sign-flip → FAIL) - binding: every predicted ΔG < 0 (non-binder → FAIL) Exit codes: 0 PASS, 1 FAIL, 2 inconclusive (partial run), 3 I/O. 9/9 regression covers: hydration PASS/FAIL via bundled fixtures, streptavidin/EGFR label selection, non-binder flagging, partial run → inconclusive, missing file exit 3, gate override doesn't silently mask sign rule, single-line-output invariant. Closes the 4-item robustness sequence from the hourly cron.
1 parent 1fea828 commit 1542ac0

3 files changed

Lines changed: 434 additions & 0 deletions

File tree

.github/workflows/smoke.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,9 @@ jobs:
196196
- name: fill_prof_email auto-fill regression (9/9)
197197
run: python -u tests/fep/test_fill_prof_email_smoke.py
198198

199+
- name: csv_tldr one-line Slack summariser (9/9)
200+
run: python -u tests/fep/test_csv_tldr_smoke.py
201+
199202
- name: fep sampled binding smoke (opt-in, ~10 min, manual)
200203
if: >
201204
github.event_name == 'workflow_dispatch' &&

scripts/csv_tldr.py

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
#!/usr/bin/env python3
2+
"""csv_tldr.py — compress a bench CSV into a single Slack/standup line.
3+
4+
Biologists running `cellsim fep-binding bench` or `cellsim fep-freesolv`
5+
end up with a CSV that the full fep-report flow expands into a multi-
6+
section markdown. For a quick status update ("how's the run going?")
7+
the full report is overkill — what's wanted is a one-liner you can
8+
paste into Slack or an email subject.
9+
10+
Usage:
11+
python scripts/csv_tldr.py run/fep/.../freesolv_results.csv
12+
python scripts/csv_tldr.py path/to/bench_results.csv --gate 2.0
13+
14+
The script auto-detects hydration vs binding from the ΔG sign
15+
pattern (hydration has mixed signs, binding has all negatives for
16+
binders), picks the appropriate gate (1.5 kcal/mol hydration,
17+
2.0 kcal/mol binding), and emits:
18+
19+
FreeSolv hydration FEP: 12/12 ok, MAE 0.42 kcal/mol (gate ≤1.5),
20+
Pearson r +0.993 — PASS
21+
22+
Exit codes:
23+
0 = PASS (all gates met)
24+
1 = FAIL (one or more gate failed)
25+
2 = inconclusive (missing data; partial run)
26+
3 = usage / I/O error
27+
"""
28+
from __future__ import annotations
29+
30+
import argparse
31+
import csv
32+
import math
33+
import sys
34+
from pathlib import Path
35+
36+
37+
def _load_rows(csv_path: Path) -> list[dict]:
38+
if not csv_path.is_file():
39+
raise FileNotFoundError(f"no such file: {csv_path}")
40+
with csv_path.open("r", encoding="utf-8-sig", newline="") as fi:
41+
reader = csv.DictReader(fi)
42+
return list(reader)
43+
44+
45+
def _f(v):
46+
if v in (None, "", "None", "nan"):
47+
return None
48+
try:
49+
return float(v)
50+
except (TypeError, ValueError):
51+
return None
52+
53+
54+
def _pearson(x: list[float], y: list[float]) -> float | None:
55+
if len(x) < 3 or len(x) != len(y):
56+
return None
57+
mx = sum(x) / len(x)
58+
my = sum(y) / len(y)
59+
num = sum((xi - mx) * (yi - my) for xi, yi in zip(x, y))
60+
dx = math.sqrt(sum((xi - mx) ** 2 for xi in x))
61+
dy = math.sqrt(sum((yi - my) ** 2 for yi in y))
62+
if dx == 0 or dy == 0:
63+
return None
64+
return num / (dx * dy)
65+
66+
67+
def _kendall_tau(x: list[float], y: list[float]) -> float | None:
68+
"""Kendall τ-a — O(n²), fine for <100 compounds."""
69+
n = len(x)
70+
if n < 3 or n != len(y):
71+
return None
72+
conc = disc = 0
73+
for i in range(n):
74+
for j in range(i + 1, n):
75+
dx = x[i] - x[j]
76+
dy = y[i] - y[j]
77+
if dx * dy > 0:
78+
conc += 1
79+
elif dx * dy < 0:
80+
disc += 1
81+
denom = n * (n - 1) / 2.0
82+
if denom == 0:
83+
return None
84+
return (conc - disc) / denom
85+
86+
87+
def detect_kind(rows: list[dict]) -> str:
88+
"""Hydration (FreeSolv-style): ΔG_expt spans positive + negative.
89+
Binding: all expt ΔG < 0 (binders only)."""
90+
expts = [_f(r.get("dG_expt_kcalmol")) for r in rows]
91+
expts = [e for e in expts if e is not None]
92+
if not expts:
93+
return "unknown"
94+
if any(e > 0 for e in expts) and any(e < 0 for e in expts):
95+
return "hydration"
96+
if all(e < 0 for e in expts):
97+
return "binding"
98+
return "unknown"
99+
100+
101+
def _label_for_kind(kind: str, rows: list[dict]) -> str:
102+
names = [(r.get("name") or "").lower() for r in rows]
103+
joined = " ".join(names)
104+
if kind == "hydration":
105+
return "FreeSolv hydration FEP"
106+
if kind == "binding":
107+
if "biotin" in joined or "streptavidin" in joined:
108+
return "streptavidin binding FEP"
109+
if ("erlotinib" in joined or "gefitinib" in joined
110+
or "lapatinib" in joined or "egfr" in joined):
111+
return "EGFR kinase binding FEP"
112+
return "binding FEP"
113+
return "FEP"
114+
115+
116+
def summarise(rows: list[dict], gate: float | None = None) -> dict:
117+
"""Return the TL;DR bundle: label, counts, metrics, verdict."""
118+
n_total = len(rows)
119+
ok_rows = [r for r in rows
120+
if (r.get("ok") or "").strip().lower() in (
121+
"true", "1", "yes")
122+
and _f(r.get("dG_pred_kcalmol")) is not None]
123+
n_ok = len(ok_rows)
124+
kind = detect_kind(rows)
125+
label = _label_for_kind(kind, rows)
126+
gate_kcal = gate
127+
if gate_kcal is None:
128+
gate_kcal = 1.5 if kind == "hydration" else 2.0
129+
130+
if not ok_rows:
131+
return {
132+
"label": label, "kind": kind,
133+
"n_total": n_total, "n_ok": n_ok,
134+
"mae": None, "pearson": None, "kendall": None,
135+
"sign_ok": None, "gate": gate_kcal,
136+
"verdict": "inconclusive",
137+
"reason": "no ok rows",
138+
}
139+
140+
expts = [_f(r["dG_expt_kcalmol"]) for r in ok_rows]
141+
preds = [_f(r["dG_pred_kcalmol"]) for r in ok_rows]
142+
abs_res = [abs(p - e) for p, e in zip(preds, expts)]
143+
mae = sum(abs_res) / len(abs_res)
144+
pearson = _pearson(expts, preds)
145+
kendall = _kendall_tau(expts, preds)
146+
147+
# Sign rule.
148+
sign_ok: bool | None = None
149+
sign_reason = ""
150+
if kind == "hydration":
151+
# Require sign-correct on every compound.
152+
sign_ok = all(
153+
(e > 0 and p > 0) or (e < 0 and p < 0)
154+
or (abs(e) < 0.5 and abs(p) < 0.5)
155+
for e, p in zip(expts, preds))
156+
if not sign_ok:
157+
sign_reason = "sign flip on ≥1 compound"
158+
elif kind == "binding":
159+
sign_ok = all(p < 0 for p in preds)
160+
if not sign_ok:
161+
sign_reason = "non-binder predicted"
162+
163+
mae_ok = mae <= gate_kcal
164+
complete = (n_ok == n_total)
165+
166+
if not complete:
167+
verdict = "inconclusive"
168+
elif mae_ok and (sign_ok is not False):
169+
verdict = "PASS"
170+
else:
171+
verdict = "FAIL"
172+
173+
return {
174+
"label": label, "kind": kind,
175+
"n_total": n_total, "n_ok": n_ok,
176+
"mae": mae, "pearson": pearson, "kendall": kendall,
177+
"sign_ok": sign_ok, "gate": gate_kcal,
178+
"verdict": verdict,
179+
"reason": sign_reason if not sign_ok else "",
180+
}
181+
182+
183+
def format_tldr(s: dict) -> str:
184+
n_ok = s["n_ok"]
185+
n_total = s["n_total"]
186+
label = s["label"]
187+
if s["mae"] is None:
188+
return (f"{label}: {n_ok}/{n_total} ok — "
189+
f"{s['verdict']} ({s['reason'] or 'no data'})")
190+
mae_str = f"MAE {s['mae']:.2f} kcal/mol (gate ≤{s['gate']:.1f})"
191+
corr = ""
192+
if s["kind"] == "hydration" and s["pearson"] is not None:
193+
corr = f", Pearson r {s['pearson']:+.3f}"
194+
elif s["kind"] == "binding" and s["kendall"] is not None:
195+
corr = f", Kendall τ {s['kendall']:+.2f}"
196+
tail = ""
197+
if s["reason"]:
198+
tail = f" ({s['reason']})"
199+
return (f"{label}: {n_ok}/{n_total} ok, {mae_str}"
200+
f"{corr}{s['verdict']}{tail}")
201+
202+
203+
def main(argv: list[str] | None = None) -> int:
204+
ap = argparse.ArgumentParser(description=__doc__)
205+
ap.add_argument("csv_path",
206+
help="bench CSV (from `cellsim fep-binding bench` "
207+
"or `cellsim fep-freesolv`)")
208+
ap.add_argument("--gate", type=float, default=None,
209+
help="MAE gate in kcal/mol (default: 1.5 for "
210+
"hydration, 2.0 for binding)")
211+
args = ap.parse_args(argv)
212+
213+
try:
214+
rows = _load_rows(Path(args.csv_path))
215+
except (FileNotFoundError, OSError) as e:
216+
print(f"csv_tldr: {e}", file=sys.stderr)
217+
return 3
218+
219+
s = summarise(rows, gate=args.gate)
220+
print(format_tldr(s))
221+
if s["verdict"] == "PASS":
222+
return 0
223+
if s["verdict"] == "FAIL":
224+
return 1
225+
return 2
226+
227+
228+
if __name__ == "__main__":
229+
sys.exit(main())

0 commit comments

Comments
 (0)