|
| 1 | +"""Run BLAST+ / DIAMOND (or load precomputed hits) into a homology hits table. |
| 2 | +
|
| 3 | +Each producer returns the bidirectional hits DataFrame (``HIT_COLUMNS``) consumed by |
| 4 | +:func:`~raven_python.reconstruction.homology.get_model_from_homology`. Binaries are |
| 5 | +located via :func:`raven_python.binaries.resolve_binary` (arg → env → PATH → bundled). |
| 6 | +""" |
| 7 | +from __future__ import annotations |
| 8 | + |
| 9 | +import io |
| 10 | +import subprocess |
| 11 | +import tempfile |
| 12 | +from collections.abc import Sequence |
| 13 | +from pathlib import Path |
| 14 | + |
| 15 | +import pandas as pd |
| 16 | + |
| 17 | +from raven_python.binaries import resolve_binary |
| 18 | +from raven_python.reconstruction.homology.hits import HIT_COLUMNS, validate_hits |
| 19 | + |
| 20 | +# Tabular output columns requested from BLAST+/DIAMOND, in order. |
| 21 | +_OUTFMT_FIELDS = ["qseqid", "sseqid", "evalue", "pident", "length", "bitscore", "ppos"] |
| 22 | +_FIELD_TO_HIT = { |
| 23 | + "qseqid": "from_gene", "sseqid": "to_gene", "evalue": "evalue", |
| 24 | + "pident": "identity", "length": "align_len", "bitscore": "bitscore", "ppos": "ppos", |
| 25 | +} |
| 26 | + |
| 27 | + |
| 28 | +def _parse_tabular(text: str, from_id: str, to_id: str, sep: str) -> pd.DataFrame: |
| 29 | + """Parse one BLAST/DIAMOND tabular output into hit rows for one direction.""" |
| 30 | + if not text.strip(): |
| 31 | + return pd.DataFrame(columns=HIT_COLUMNS) |
| 32 | + df = pd.read_csv(io.StringIO(text), sep=sep, names=_OUTFMT_FIELDS, dtype={0: str, 1: str}) |
| 33 | + df = df.rename(columns=_FIELD_TO_HIT) |
| 34 | + df["from_id"] = from_id |
| 35 | + df["to_id"] = to_id |
| 36 | + return df[HIT_COLUMNS] |
| 37 | + |
| 38 | + |
| 39 | +def _as_list(x): |
| 40 | + return [x] if isinstance(x, (str, Path)) else list(x) |
| 41 | + |
| 42 | + |
| 43 | +def _run(cmd: list[str]) -> str: |
| 44 | + proc = subprocess.run(cmd, capture_output=True, text=True) |
| 45 | + if proc.returncode != 0: |
| 46 | + raise RuntimeError(f"{cmd[0]} failed:\n{proc.stderr.strip()}") |
| 47 | + return proc.stdout |
| 48 | + |
| 49 | + |
| 50 | +def run_blast( |
| 51 | + organism_id: str, |
| 52 | + fasta: str | Path, |
| 53 | + model_ids: Sequence[str], |
| 54 | + ref_fastas: Sequence[str | Path], |
| 55 | + *, |
| 56 | + evalue: float = 1e-5, |
| 57 | + threads: int = 1, |
| 58 | + blastp: str | Path | None = None, |
| 59 | + makeblastdb: str | Path | None = None, |
| 60 | +) -> pd.DataFrame: |
| 61 | + """Bidirectional BLAST+ between an organism and template organisms. |
| 62 | +
|
| 63 | + Returns the hits DataFrame (filtered at |
| 64 | + ``evalue``). Requires BLAST+ (`blastp`, `makeblastdb`). |
| 65 | + """ |
| 66 | + model_ids = list(model_ids) |
| 67 | + ref_fastas = _as_list(ref_fastas) |
| 68 | + if len(model_ids) != len(ref_fastas): |
| 69 | + raise ValueError("model_ids and ref_fastas must have the same length.") |
| 70 | + blastp = resolve_binary("blastp", binary=blastp) |
| 71 | + makeblastdb = resolve_binary("makeblastdb", binary=makeblastdb) |
| 72 | + outfmt = "10 " + " ".join(_OUTFMT_FIELDS) # 10 = CSV |
| 73 | + |
| 74 | + frames = [] |
| 75 | + with tempfile.TemporaryDirectory() as tmp: |
| 76 | + tmp = Path(tmp) |
| 77 | + |
| 78 | + def blastp_dir(query, subject_fasta, from_id, to_id): |
| 79 | + db = tmp / f"db_{from_id}_{to_id}" |
| 80 | + _run([makeblastdb, "-in", str(subject_fasta), "-dbtype", "prot", "-out", str(db)]) |
| 81 | + out = _run([ |
| 82 | + blastp, "-query", str(query), "-db", str(db), "-evalue", str(evalue), |
| 83 | + "-outfmt", outfmt, "-num_threads", str(threads), |
| 84 | + ]) |
| 85 | + return _parse_tabular(out, from_id, to_id, sep=",") |
| 86 | + |
| 87 | + for model_id, ref in zip(model_ids, ref_fastas, strict=True): |
| 88 | + # template -> organism, and organism -> template |
| 89 | + frames.append(blastp_dir(ref, fasta, model_id, organism_id)) |
| 90 | + frames.append(blastp_dir(fasta, ref, organism_id, model_id)) |
| 91 | + return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS) |
| 92 | + |
| 93 | + |
| 94 | +def run_diamond( |
| 95 | + organism_id: str, |
| 96 | + fasta: str | Path, |
| 97 | + model_ids: Sequence[str], |
| 98 | + ref_fastas: Sequence[str | Path], |
| 99 | + *, |
| 100 | + evalue: float = 1e-5, |
| 101 | + threads: int = 1, |
| 102 | + sensitivity: str = "--more-sensitive", |
| 103 | + diamond: str | Path | None = None, |
| 104 | +) -> pd.DataFrame: |
| 105 | + """Bidirectional DIAMOND between an organism and template organisms. |
| 106 | +
|
| 107 | + Returns the hits DataFrame. Requires DIAMOND. |
| 108 | + """ |
| 109 | + model_ids = list(model_ids) |
| 110 | + ref_fastas = _as_list(ref_fastas) |
| 111 | + if len(model_ids) != len(ref_fastas): |
| 112 | + raise ValueError("model_ids and ref_fastas must have the same length.") |
| 113 | + diamond = resolve_binary("diamond", binary=diamond) |
| 114 | + |
| 115 | + frames = [] |
| 116 | + with tempfile.TemporaryDirectory() as tmp: |
| 117 | + tmp = Path(tmp) |
| 118 | + |
| 119 | + def diamond_dir(query, subject_fasta, from_id, to_id): |
| 120 | + db = tmp / f"db_{from_id}_{to_id}" |
| 121 | + _run([diamond, "makedb", "--in", str(subject_fasta), "--db", str(db)]) |
| 122 | + cmd = [diamond, "blastp", "--query", str(query), "--db", str(db), |
| 123 | + "--evalue", str(evalue), "--outfmt", "6", *_OUTFMT_FIELDS, |
| 124 | + "--threads", str(threads)] |
| 125 | + if sensitivity: |
| 126 | + cmd.append(sensitivity) |
| 127 | + return _parse_tabular(_run(cmd), from_id, to_id, sep="\t") |
| 128 | + |
| 129 | + for model_id, ref in zip(model_ids, ref_fastas, strict=True): |
| 130 | + frames.append(diamond_dir(ref, fasta, model_id, organism_id)) |
| 131 | + frames.append(diamond_dir(fasta, ref, organism_id, model_id)) |
| 132 | + return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS) |
| 133 | + |
| 134 | + |
| 135 | +def blast_from_table(source: str | Path | pd.DataFrame) -> pd.DataFrame: |
| 136 | + """Load a precomputed homology hits table (CSV path or DataFrame). |
| 137 | +
|
| 138 | + a plain CSV/DataFrame, not Excel. |
| 139 | + Must contain the ``HIT_COLUMNS`` columns. |
| 140 | + """ |
| 141 | + # Force gene-id columns to str: an all-numeric gene-id column (e.g. Entrez ids) |
| 142 | + # would otherwise be read as int64 and never match the string gene ids in a model. |
| 143 | + df = (source if isinstance(source, pd.DataFrame) |
| 144 | + else pd.read_csv(source, dtype={"from_gene": str, "to_gene": str})) |
| 145 | + validate_hits(df) |
| 146 | + return df[HIT_COLUMNS].copy() |
0 commit comments