Skip to content

Commit 2c356c0

Browse files
authored
Reconstruction: BLAST/DIAMOND wrappers and homology-based drafting (#4)
* Project scaffold: pyproject + package skeleton + README + LICENSE * Add GitHub Actions CI and the maintainer-scripts README * Add the foundation utilities: GPR, balance, parse, sort, validate * Add the model-manipulation layer (add, remove, transport, merge, etc.) * Add binary + data resolvers for external tools and published artefacts * Add YAML and SIF model I/O * Add Excel export and the Standard-GEM git-layout export * Add BLAST and DIAMOND wrappers for protein-homology searches * Add the homology-based draft model builder (getModelFromHomology port)
1 parent 8a28f0d commit 2c356c0

7 files changed

Lines changed: 727 additions & 0 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""De novo reconstruction from KEGG and protein homology (BLAST/DIAMOND)."""
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""Homology-based reconstruction from template models (getModelFromHomology, BLAST/DIAMOND)."""
2+
from raven_python.reconstruction.homology.blast import (
3+
blast_from_table,
4+
run_blast,
5+
run_diamond,
6+
)
7+
from raven_python.reconstruction.homology.hits import HIT_COLUMNS, make_ortholog_hits, validate_hits
8+
from raven_python.reconstruction.homology.homology import HomologyResult, get_model_from_homology
9+
10+
__all__ = [
11+
"HIT_COLUMNS",
12+
"HomologyResult",
13+
"blast_from_table",
14+
"get_model_from_homology",
15+
"make_ortholog_hits",
16+
"run_blast",
17+
"run_diamond",
18+
"validate_hits",
19+
]
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""Run BLAST+ / DIAMOND (or load precomputed hits) into a homology hits table.
2+
3+
Each producer returns the bidirectional hits DataFrame (``HIT_COLUMNS``) consumed by
4+
:func:`~raven_python.reconstruction.homology.get_model_from_homology`. Binaries are
5+
located via :func:`raven_python.binaries.resolve_binary` (arg → env → PATH → bundled).
6+
"""
7+
from __future__ import annotations
8+
9+
import io
10+
import subprocess
11+
import tempfile
12+
from collections.abc import Sequence
13+
from pathlib import Path
14+
15+
import pandas as pd
16+
17+
from raven_python.binaries import resolve_binary
18+
from raven_python.reconstruction.homology.hits import HIT_COLUMNS, validate_hits
19+
20+
# Tabular output columns requested from BLAST+/DIAMOND, in order.
21+
_OUTFMT_FIELDS = ["qseqid", "sseqid", "evalue", "pident", "length", "bitscore", "ppos"]
22+
_FIELD_TO_HIT = {
23+
"qseqid": "from_gene", "sseqid": "to_gene", "evalue": "evalue",
24+
"pident": "identity", "length": "align_len", "bitscore": "bitscore", "ppos": "ppos",
25+
}
26+
27+
28+
def _parse_tabular(text: str, from_id: str, to_id: str, sep: str) -> pd.DataFrame:
29+
"""Parse one BLAST/DIAMOND tabular output into hit rows for one direction."""
30+
if not text.strip():
31+
return pd.DataFrame(columns=HIT_COLUMNS)
32+
df = pd.read_csv(io.StringIO(text), sep=sep, names=_OUTFMT_FIELDS, dtype={0: str, 1: str})
33+
df = df.rename(columns=_FIELD_TO_HIT)
34+
df["from_id"] = from_id
35+
df["to_id"] = to_id
36+
return df[HIT_COLUMNS]
37+
38+
39+
def _as_list(x):
40+
return [x] if isinstance(x, (str, Path)) else list(x)
41+
42+
43+
def _run(cmd: list[str]) -> str:
44+
proc = subprocess.run(cmd, capture_output=True, text=True)
45+
if proc.returncode != 0:
46+
raise RuntimeError(f"{cmd[0]} failed:\n{proc.stderr.strip()}")
47+
return proc.stdout
48+
49+
50+
def run_blast(
51+
organism_id: str,
52+
fasta: str | Path,
53+
model_ids: Sequence[str],
54+
ref_fastas: Sequence[str | Path],
55+
*,
56+
evalue: float = 1e-5,
57+
threads: int = 1,
58+
blastp: str | Path | None = None,
59+
makeblastdb: str | Path | None = None,
60+
) -> pd.DataFrame:
61+
"""Bidirectional BLAST+ between an organism and template organisms.
62+
63+
Returns the hits DataFrame (filtered at
64+
``evalue``). Requires BLAST+ (`blastp`, `makeblastdb`).
65+
"""
66+
model_ids = list(model_ids)
67+
ref_fastas = _as_list(ref_fastas)
68+
if len(model_ids) != len(ref_fastas):
69+
raise ValueError("model_ids and ref_fastas must have the same length.")
70+
blastp = resolve_binary("blastp", binary=blastp)
71+
makeblastdb = resolve_binary("makeblastdb", binary=makeblastdb)
72+
outfmt = "10 " + " ".join(_OUTFMT_FIELDS) # 10 = CSV
73+
74+
frames = []
75+
with tempfile.TemporaryDirectory() as tmp:
76+
tmp = Path(tmp)
77+
78+
def blastp_dir(query, subject_fasta, from_id, to_id):
79+
db = tmp / f"db_{from_id}_{to_id}"
80+
_run([makeblastdb, "-in", str(subject_fasta), "-dbtype", "prot", "-out", str(db)])
81+
out = _run([
82+
blastp, "-query", str(query), "-db", str(db), "-evalue", str(evalue),
83+
"-outfmt", outfmt, "-num_threads", str(threads),
84+
])
85+
return _parse_tabular(out, from_id, to_id, sep=",")
86+
87+
for model_id, ref in zip(model_ids, ref_fastas, strict=True):
88+
# template -> organism, and organism -> template
89+
frames.append(blastp_dir(ref, fasta, model_id, organism_id))
90+
frames.append(blastp_dir(fasta, ref, organism_id, model_id))
91+
return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS)
92+
93+
94+
def run_diamond(
95+
organism_id: str,
96+
fasta: str | Path,
97+
model_ids: Sequence[str],
98+
ref_fastas: Sequence[str | Path],
99+
*,
100+
evalue: float = 1e-5,
101+
threads: int = 1,
102+
sensitivity: str = "--more-sensitive",
103+
diamond: str | Path | None = None,
104+
) -> pd.DataFrame:
105+
"""Bidirectional DIAMOND between an organism and template organisms.
106+
107+
Returns the hits DataFrame. Requires DIAMOND.
108+
"""
109+
model_ids = list(model_ids)
110+
ref_fastas = _as_list(ref_fastas)
111+
if len(model_ids) != len(ref_fastas):
112+
raise ValueError("model_ids and ref_fastas must have the same length.")
113+
diamond = resolve_binary("diamond", binary=diamond)
114+
115+
frames = []
116+
with tempfile.TemporaryDirectory() as tmp:
117+
tmp = Path(tmp)
118+
119+
def diamond_dir(query, subject_fasta, from_id, to_id):
120+
db = tmp / f"db_{from_id}_{to_id}"
121+
_run([diamond, "makedb", "--in", str(subject_fasta), "--db", str(db)])
122+
cmd = [diamond, "blastp", "--query", str(query), "--db", str(db),
123+
"--evalue", str(evalue), "--outfmt", "6", *_OUTFMT_FIELDS,
124+
"--threads", str(threads)]
125+
if sensitivity:
126+
cmd.append(sensitivity)
127+
return _parse_tabular(_run(cmd), from_id, to_id, sep="\t")
128+
129+
for model_id, ref in zip(model_ids, ref_fastas, strict=True):
130+
frames.append(diamond_dir(ref, fasta, model_id, organism_id))
131+
frames.append(diamond_dir(fasta, ref, organism_id, model_id))
132+
return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS)
133+
134+
135+
def blast_from_table(source: str | Path | pd.DataFrame) -> pd.DataFrame:
136+
"""Load a precomputed homology hits table (CSV path or DataFrame).
137+
138+
a plain CSV/DataFrame, not Excel.
139+
Must contain the ``HIT_COLUMNS`` columns.
140+
"""
141+
# Force gene-id columns to str: an all-numeric gene-id column (e.g. Entrez ids)
142+
# would otherwise be read as int64 and never match the string gene ids in a model.
143+
df = (source if isinstance(source, pd.DataFrame)
144+
else pd.read_csv(source, dtype={"from_gene": str, "to_gene": str}))
145+
validate_hits(df)
146+
return df[HIT_COLUMNS].copy()
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""Homology hits table — the data structure shared across the homology track.
2+
3+
The hits are one tidy ``pandas.DataFrame`` of bidirectional hits, one row per hit.
4+
This is the currency between the BLAST / DIAMOND wrappers and
5+
:func:`get_model_from_homology`.
6+
7+
Columns (``HIT_COLUMNS``):
8+
``from_id, to_id`` (organism/model ids), ``from_gene, to_gene`` (the matched
9+
genes; ``from_gene`` is in ``from_id``), and the hit metrics
10+
``evalue, identity, align_len, bitscore, ppos``.
11+
"""
12+
from __future__ import annotations
13+
14+
from collections.abc import Iterable
15+
16+
import pandas as pd
17+
18+
HIT_COLUMNS = [
19+
"from_id", "to_id", "from_gene", "to_gene",
20+
"evalue", "identity", "align_len", "bitscore", "ppos",
21+
]
22+
23+
24+
def make_ortholog_hits(
25+
ortholog_pairs: Iterable[tuple[str, str]],
26+
source_model_id: str,
27+
target_id: str,
28+
) -> pd.DataFrame:
29+
"""Build a bidirectional hits table from a predefined ortholog list.
30+
31+
Each ``(source_gene, target_gene)``
32+
pair is emitted in both directions with sentinel metrics (evalue 0,
33+
identity 100, align_len 1000, bitscore 1000, ppos 100) so every pair passes
34+
any reasonable filter. Lets a known ortholog mapping feed
35+
:func:`get_model_from_homology` with no BLAST run — also the testing entry
36+
point.
37+
38+
Parameters
39+
----------
40+
ortholog_pairs
41+
Iterable of ``(source_gene, target_gene)`` — source = template/model
42+
organism, target = the organism being built.
43+
source_model_id
44+
ID of the template model the source genes belong to.
45+
target_id
46+
ID of the organism to build a model for (``model_for``).
47+
"""
48+
pairs = [(str(s), str(t)) for s, t in ortholog_pairs]
49+
if not pairs:
50+
raise ValueError("ortholog_pairs is empty.")
51+
52+
rows = []
53+
for source_gene, target_gene in pairs:
54+
rows.append((source_model_id, target_id, source_gene, target_gene, 0.0, 100.0, 1000, 1000.0, 100.0))
55+
rows.append((target_id, source_model_id, target_gene, source_gene, 0.0, 100.0, 1000, 1000.0, 100.0))
56+
return pd.DataFrame(rows, columns=HIT_COLUMNS)
57+
58+
59+
def validate_hits(hits: pd.DataFrame) -> pd.DataFrame:
60+
"""Check a hits DataFrame has the required columns; return it unchanged."""
61+
missing = [c for c in HIT_COLUMNS if c not in hits.columns]
62+
if missing:
63+
raise ValueError(f"hits is missing required columns: {missing}")
64+
return hits

0 commit comments

Comments
 (0)