SysBioChalmers
diff --git a/src/raven_python/reconstruction/__init__.py b/src/raven_python/reconstruction/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/src/raven_python/reconstruction/homology/__init__.py b/src/raven_python/reconstruction/homology/__init__.py
Lines changed: 19 additions & 0 deletions
diff --git a/src/raven_python/reconstruction/homology/blast.py b/src/raven_python/reconstruction/homology/blast.py
Lines changed: 146 additions & 0 deletions
diff --git a/src/raven_python/reconstruction/homology/hits.py b/src/raven_python/reconstruction/homology/hits.py
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1 @@
+"""De novo reconstruction from KEGG and protein homology (BLAST/DIAMOND)."""
@@ -0,0 +1,19 @@
+"""Homology-based reconstruction from template models (getModelFromHomology, BLAST/DIAMOND)."""
+from raven_python.reconstruction.homology.blast import (
+    blast_from_table,
+    run_blast,
+    run_diamond,
+)
+from raven_python.reconstruction.homology.hits import HIT_COLUMNS, make_ortholog_hits, validate_hits
+from raven_python.reconstruction.homology.homology import HomologyResult, get_model_from_homology
+
+# Public names of the homology subpackage, re-exported from its blast, hits and
+# homology modules.
+__all__ = [
+    "HIT_COLUMNS",
+    "HomologyResult",
+    "blast_from_table",
+    "get_model_from_homology",
+    "make_ortholog_hits",
+    "run_blast",
+    "run_diamond",
+    "validate_hits",
+]
@@ -0,0 +1,146 @@
+"""Run BLAST+ / DIAMOND (or load precomputed hits) into a homology hits table.
+
+Each producer returns the bidirectional hits DataFrame (``HIT_COLUMNS``) consumed by
+:func:`~raven_python.reconstruction.homology.get_model_from_homology`. Binaries are
+located via :func:`raven_python.binaries.resolve_binary` (arg → env → PATH → bundled).
+"""
+from __future__ import annotations
+
+import io
+import subprocess
+import tempfile
+from collections.abc import Sequence
+from pathlib import Path
+
+import pandas as pd
+
+from raven_python.binaries import resolve_binary
+from raven_python.reconstruction.homology.hits import HIT_COLUMNS, validate_hits
+
+# Tabular output columns requested from BLAST+/DIAMOND, in order. The same list is
+# used as the column names when the tool output is parsed, so the order matters.
+_OUTFMT_FIELDS = ["qseqid", "sseqid", "evalue", "pident", "length", "bitscore", "ppos"]
+# Maps each requested output field to the hits-table column it is renamed to.
+_FIELD_TO_HIT = {
+    "qseqid": "from_gene", "sseqid": "to_gene", "evalue": "evalue",
+    "pident": "identity", "length": "align_len", "bitscore": "bitscore", "ppos": "ppos",
+}
+
+
+def _parse_tabular(text: str, from_id: str, to_id: str, sep: str) -> pd.DataFrame:
+    """Parse one BLAST/DIAMOND tabular output into hit rows for one direction."""
+    if not text.strip():
+        return pd.DataFrame(columns=HIT_COLUMNS)
+    df = pd.read_csv(io.StringIO(text), sep=sep, names=_OUTFMT_FIELDS, dtype={0: str, 1: str})
+    df = df.rename(columns=_FIELD_TO_HIT)
+    df["from_id"] = from_id
+    df["to_id"] = to_id
+    return df[HIT_COLUMNS]
+
+
+def _as_list(x):
+    return [x] if isinstance(x, (str, Path)) else list(x)
+
+
+def _run(cmd: list[str]) -> str:
+    proc = subprocess.run(cmd, capture_output=True, text=True)
+    if proc.returncode != 0:
+        raise RuntimeError(f"{cmd[0]} failed:\n{proc.stderr.strip()}")
+    return proc.stdout
+
+
+def run_blast(
+    organism_id: str,
+    fasta: str | Path,
+    model_ids: Sequence[str],
+    ref_fastas: Sequence[str | Path],
+    *,
+    evalue: float = 1e-5,
+    threads: int = 1,
+    blastp: str | Path | None = None,
+    makeblastdb: str | Path | None = None,
+) -> pd.DataFrame:
+    """Bidirectional BLAST+ between an organism and template organisms.
+
+    Returns the hits DataFrame (filtered at
+    ``evalue``). Requires BLAST+ (`blastp`, `makeblastdb`).
+    """
+    model_ids = list(model_ids)
+    ref_fastas = _as_list(ref_fastas)
+    if len(model_ids) != len(ref_fastas):
+        raise ValueError("model_ids and ref_fastas must have the same length.")
+    blastp = resolve_binary("blastp", binary=blastp)
+    makeblastdb = resolve_binary("makeblastdb", binary=makeblastdb)
+    outfmt = "10 " + " ".join(_OUTFMT_FIELDS)  # 10 = CSV
+
+    frames = []
+    with tempfile.TemporaryDirectory() as tmp:
+        tmp = Path(tmp)
+
+        def blastp_dir(query, subject_fasta, from_id, to_id):
+            db = tmp / f"db_{from_id}_{to_id}"
+            _run([makeblastdb, "-in", str(subject_fasta), "-dbtype", "prot", "-out", str(db)])
+            out = _run([
+                blastp, "-query", str(query), "-db", str(db), "-evalue", str(evalue),
+                "-outfmt", outfmt, "-num_threads", str(threads),
+            ])
+            return _parse_tabular(out, from_id, to_id, sep=",")
+
+        for model_id, ref in zip(model_ids, ref_fastas, strict=True):
+            # template -> organism, and organism -> template
+            frames.append(blastp_dir(ref, fasta, model_id, organism_id))
+            frames.append(blastp_dir(fasta, ref, organism_id, model_id))
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS)
+
+
+def run_diamond(
+    organism_id: str,
+    fasta: str | Path,
+    model_ids: Sequence[str],
+    ref_fastas: Sequence[str | Path],
+    *,
+    evalue: float = 1e-5,
+    threads: int = 1,
+    sensitivity: str = "--more-sensitive",
+    diamond: str | Path | None = None,
+) -> pd.DataFrame:
+    """Bidirectional DIAMOND between an organism and template organisms.
+
+    Returns the hits DataFrame. Requires DIAMOND.
+    """
+    model_ids = list(model_ids)
+    ref_fastas = _as_list(ref_fastas)
+    if len(model_ids) != len(ref_fastas):
+        raise ValueError("model_ids and ref_fastas must have the same length.")
+    diamond = resolve_binary("diamond", binary=diamond)
+
+    frames = []
+    with tempfile.TemporaryDirectory() as tmp:
+        tmp = Path(tmp)
+
+        def diamond_dir(query, subject_fasta, from_id, to_id):
+            db = tmp / f"db_{from_id}_{to_id}"
+            _run([diamond, "makedb", "--in", str(subject_fasta), "--db", str(db)])
+            cmd = [diamond, "blastp", "--query", str(query), "--db", str(db),
+                   "--evalue", str(evalue), "--outfmt", "6", *_OUTFMT_FIELDS,
+                   "--threads", str(threads)]
+            if sensitivity:
+                cmd.append(sensitivity)
+            return _parse_tabular(_run(cmd), from_id, to_id, sep="\t")
+
+        for model_id, ref in zip(model_ids, ref_fastas, strict=True):
+            frames.append(diamond_dir(ref, fasta, model_id, organism_id))
+            frames.append(diamond_dir(fasta, ref, organism_id, model_id))
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS)
+
+
+def blast_from_table(source: str | Path | pd.DataFrame) -> pd.DataFrame:
+    """Load a precomputed homology hits table (CSV path or DataFrame).
+
+    a plain CSV/DataFrame, not Excel.
+    Must contain the ``HIT_COLUMNS`` columns.
+    """
+    # Force gene-id columns to str: an all-numeric gene-id column (e.g. Entrez ids)
+    # would otherwise be read as int64 and never match the string gene ids in a model.
+    df = (source if isinstance(source, pd.DataFrame)
+          else pd.read_csv(source, dtype={"from_gene": str, "to_gene": str}))
+    validate_hits(df)
+    return df[HIT_COLUMNS].copy()
@@ -0,0 +1,64 @@
+"""Homology hits table — the data structure shared across the homology track.
+
+The hits are one tidy ``pandas.DataFrame`` of bidirectional hits, one row per hit.
+This is the currency between the BLAST / DIAMOND wrappers and
+:func:`get_model_from_homology`.
+
+Columns (``HIT_COLUMNS``):
+``from_id, to_id`` (organism/model ids), ``from_gene, to_gene`` (the matched
+genes; ``from_gene`` is in ``from_id``), and the hit metrics
+``evalue, identity, align_len, bitscore, ppos``.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import pandas as pd
+
+# Required columns of a hits table, in canonical order: the two organism/model ids,
+# the matched genes (``from_gene`` belongs to ``from_id``), then the hit metrics.
+HIT_COLUMNS = [
+    "from_id", "to_id", "from_gene", "to_gene",
+    "evalue", "identity", "align_len", "bitscore", "ppos",
+]
+
+
+def make_ortholog_hits(
+    ortholog_pairs: Iterable[tuple[str, str]],
+    source_model_id: str,
+    target_id: str,
+) -> pd.DataFrame:
+    """Build a bidirectional hits table from a predefined ortholog list.
+
+    Each ``(source_gene, target_gene)``
+    pair is emitted in both directions with sentinel metrics (evalue 0,
+    identity 100, align_len 1000, bitscore 1000, ppos 100) so every pair passes
+    any reasonable filter. Lets a known ortholog mapping feed
+    :func:`get_model_from_homology` with no BLAST run — also the testing entry
+    point.
+
+    Parameters
+    ----------
+    ortholog_pairs
+        Iterable of ``(source_gene, target_gene)`` — source = template/model
+        organism, target = the organism being built.
+    source_model_id
+        ID of the template model the source genes belong to.
+    target_id
+        ID of the organism to build a model for (``model_for``).
+    """
+    pairs = [(str(s), str(t)) for s, t in ortholog_pairs]
+    if not pairs:
+        raise ValueError("ortholog_pairs is empty.")
+
+    rows = []
+    for source_gene, target_gene in pairs:
+        rows.append((source_model_id, target_id, source_gene, target_gene, 0.0, 100.0, 1000, 1000.0, 100.0))
+        rows.append((target_id, source_model_id, target_gene, source_gene, 0.0, 100.0, 1000, 1000.0, 100.0))
+    return pd.DataFrame(rows, columns=HIT_COLUMNS)
+
+
+def validate_hits(hits: pd.DataFrame) -> pd.DataFrame:
+    """Check a hits DataFrame has the required columns; return it unchanged."""
+    missing = [c for c in HIT_COLUMNS if c not in hits.columns]
+    if missing:
+        raise ValueError(f"hits is missing required columns: {missing}")
+    return hits
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""De novo reconstruction from KEGG and protein homology (BLAST/DIAMOND)."""`