SysBioChalmers
diff --git a/src/raven_python/analysis/__init__.py b/src/raven_python/analysis/__init__.py
Lines changed: 23 additions & 0 deletions
diff --git a/src/raven_python/analysis/fseof.py b/src/raven_python/analysis/fseof.py
Lines changed: 161 additions & 0 deletions
diff --git a/src/raven_python/analysis/reporter.py b/src/raven_python/analysis/reporter.py
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,23 @@
+"""Analyses not in cobrapy's core.
+
+* :func:`reporter_metabolites` — Reporter Metabolites (around-metabolite gene-score test).
+* :func:`fseof` — Flux Scanning based on Enforced Objective Flux.
+* :func:`random_sampling` — random-objective flux sampling.
+"""
+# Re-export the analysis entry points and their result containers so they can be
+# imported directly from this package.
+from raven_python.analysis.fseof import FSEOFResult, fseof
+from raven_python.analysis.reporter import ReporterResult, reporter_metabolites
+from raven_python.analysis.sampling import (
+    RandomSamplingResult,
+    find_good_reactions,
+    random_sampling,
+)
+
+# Public API of the package: exactly the names imported above.
+__all__ = [
+    "FSEOFResult",
+    "RandomSamplingResult",
+    "ReporterResult",
+    "find_good_reactions",
+    "fseof",
+    "random_sampling",
+    "reporter_metabolites",
+]
@@ -0,0 +1,161 @@
+"""Flux Scanning based on Enforced Objective Flux — FSEOF (port + redesign).
+
+FSEOF (Choi et al., Appl Environ Microbiol 2010) finds metabolic-engineering targets
+for over-producing a metabolite: enforce an increasing flux toward the target product
+while optimising growth, and watch how each reaction's flux responds. This is a port
+of RAVEN's ``FSEOF`` with a substantially richer, more robust output (RAVEN's
+weaknesses are noted in IMPROVEMENTS, FS1–FS4):
+
+* **Robust trend, not strict monotonicity.** Each reaction's flux is regressed against
+  the enforced product flux across the scan; the **slope** is the response and the
+  **correlation** (|r|) is a quality score. A reaction is a target if it tracks the
+  product cleanly (|r| ≥ ``correlation_threshold``) — one noisy step from LP
+  alternative optima no longer discards it (and pFBA per step keeps the scan stable).
+* **Direction classification RAVEN lacks.** Targets are labelled ``amplify`` (|flux|
+  rises with the product → over-express), ``knockdown`` (|flux| falls), or ``knockout``
+  (|flux| → ~0 → delete). RAVEN only ever reports the amplification targets.
+* **Gene-level view** via :attr:`FSEOFResult.gene_targets`, and the full flux scan is
+  retained in :attr:`FSEOFResult.scan` — all as DataFrames, not a printed TSV.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cobra
+import numpy as np
+import pandas as pd
+from cobra.exceptions import OptimizationError
+from cobra.flux_analysis import pfba
+from scipy.stats import linregress
+
+
+@dataclass
+class FSEOFResult:
+    """FSEOF output.
+
+    ``scan`` is reactions × enforced-flux-levels (the full flux scan); ``enforced`` are
+    the enforced target fluxes; ``targets`` is the classified per-reaction table
+    (sorted by score). :attr:`gene_targets` aggregates targets to genes.
+    """
+
+    scan: pd.DataFrame
+    enforced: list[float]
+    targets: pd.DataFrame
+
+    @property
+    def amplification(self) -> pd.DataFrame:
+        return self.targets[self.targets["target_type"] == "amplify"].reset_index(drop=True)
+
+    @property
+    def knockout(self) -> pd.DataFrame:
+        mask = self.targets["target_type"].isin(["knockout", "knockdown"])
+        return self.targets[mask].reset_index(drop=True)
+
+    @property
+    def gene_targets(self) -> pd.DataFrame:
+        """Per-gene aggregation: the target reactions each gene is associated with."""
+        rows = []
+        for _, t in self.targets.iterrows():
+            for gene in t["genes"]:
+                rows.append({"gene": gene, "reaction": t["reaction"],
+                             "target_type": t["target_type"], "slope": t["slope"]})
+        if not rows:
+            return pd.DataFrame(columns=["gene", "target_type", "reactions", "max_abs_slope"])
+        df = pd.DataFrame(rows)
+        agg = df.groupby("gene").agg(
+            target_type=("target_type", lambda s: ";".join(sorted(set(s)))),
+            reactions=("reaction", lambda s: ";".join(sorted(set(s)))),
+            max_abs_slope=("slope", lambda s: float(np.max(np.abs(s)))),
+        ).reset_index()
+        return agg.sort_values("max_abs_slope", ascending=False, ignore_index=True)
+
+
+def fseof(
+    model: cobra.Model,
+    target_rxn: str,
+    *,
+    biomass_rxn: str | None = None,
+    n_steps: int = 10,
+    max_fraction: float = 0.9,
+    correlation_threshold: float = 0.9,
+    flux_eps: float = 1e-6,
+) -> FSEOFResult:
+    """Run FSEOF for over-production of ``target_rxn``'s product.
+
+    Enforces target flux from ``max_fraction/n_steps`` up to ``max_fraction`` of the
+    theoretical maximum in ``n_steps`` steps, maximising growth (``biomass_rxn`` or the
+    model's current objective) with pFBA at each step. Returns an :class:`FSEOFResult`.
+    """
+    with model:  # find the theoretical maximum target flux
+        model.objective = target_rxn
+        target_opt = model.slim_optimize()
+    # slim_optimize returns NaN on an infeasible model; np.isfinite catches that too.
+    if target_opt is None or not np.isfinite(target_opt) or target_opt <= flux_eps:
+        raise ValueError(f"{target_rxn!r} cannot carry positive flux; nothing to scan.")
+    target_max = target_opt * max_fraction
+    levels = [target_max * (i + 1) / n_steps for i in range(n_steps)]
+
+    columns: dict[float, pd.Series] = {}
+    enforced: list[float] = []
+    for level in levels:
+        with model:
+            if biomass_rxn is not None:
+                model.objective = biomass_rxn
+            model.reactions.get_by_id(target_rxn).lower_bound = level
+            try:
+                columns[level] = pfba(model).fluxes
+            except OptimizationError:
+                break  # enforced flux became infeasible — stop scanning
+            enforced.append(level)
+    if len(enforced) < 2:
+        raise RuntimeError("FSEOF needs at least two feasible enforced-flux levels.")
+
+    scan = pd.DataFrame(columns)
+    targets = _classify(model, scan, np.asarray(enforced), correlation_threshold, flux_eps)
+    return FSEOFResult(scan=scan, enforced=enforced, targets=targets)
+
+
+def _classify(model, scan, enforced, corr_threshold, flux_eps) -> pd.DataFrame:
+    rows = []
+    for rxn in model.reactions:
+        flux = scan.loc[rxn.id, enforced.tolist() if hasattr(enforced, "tolist") else enforced]
+        flux = flux.to_numpy(dtype=float)
+        initial, final = flux[0], flux[-1]
+        if flux.std() < flux_eps:  # flat -> no response
+            continue
+        fit = linregress(enforced, flux)
+        slope, corr = float(fit.slope), float(fit.rvalue)
+        if abs(corr) < corr_threshold or abs(slope) < flux_eps:
+            continue
+        # Classify on the slope of |flux| vs the enforced product flux — the
+        # criterion the docstring states (|flux| rises = amplify, etc.). The
+        # old endpoint-only check (``abs(final) vs abs(initial)``) could
+        # mislabel a track whose first/last values straddled a peak/trough but
+        # whose overall trend was the opposite. Keep ``knockout`` for tracks
+        # the regression drives essentially to zero.
+        abs_fit = linregress(enforced, np.abs(flux))
+        abs_slope = float(abs_fit.slope)
+        if abs(final) < flux_eps and abs_slope < 0:
+            ttype = "knockout"
+        elif abs_slope > 0:
+            ttype = "amplify"
+        else:
+            ttype = "knockdown"
+        rows.append({
+            "reaction": rxn.id,
+            "name": rxn.name,
+            "subsystem": rxn.subsystem,
+            "gene_reaction_rule": rxn.gene_reaction_rule,
+            "genes": sorted(g.id for g in rxn.genes),
+            "target_type": ttype,
+            "slope": slope,
+            "correlation": corr,
+            "initial_flux": initial,
+            "final_flux": final,
+            "score": abs(slope) * abs(corr),
+        })
+    table = pd.DataFrame(rows, columns=[
+        "reaction", "name", "subsystem", "gene_reaction_rule", "genes",
+        "target_type", "slope", "correlation", "initial_flux", "final_flux", "score",
+    ])
+    return table.sort_values("score", ascending=False, ignore_index=True)
@@ -0,0 +1,117 @@
+"""Reporter Metabolites — metabolites around which transcriptional change concentrates.
+
+Patil & Nielsen, PNAS 2005. Each gene's differential-expression p-value becomes a
+Z-score ``z = -Φ⁻¹(p)``; for every metabolite the Z-scores of the genes on its
+neighbouring reactions are aggregated (``Σz / √n``), background-corrected, and turned
+back into a p-value.
+
+The background correction has an exact closed form (sampling with replacement from the
+scored-gene pool: a random ``Σz/√n`` has mean ``√n·μ`` and standard deviation ``σ``
+with μ, σ the mean/std of the scored Z-scores), so the corrected score is just
+``(metZ − √n·μ) / σ`` — no Monte-Carlo sampling needed.
+"""
+from __future__ import annotations
+
+import math
+from collections.abc import Mapping
+from dataclasses import dataclass
+
+import cobra
+import numpy as np
+import pandas as pd
+from scipy.stats import norm
+
+_CLAMP = 15.0  # |Z| cap for p-values of exactly 0 or 1 (RAVEN's ±15)
+
+
+@dataclass
+class ReporterResult:
+    """Reporter-metabolite scores for one gene set.
+
+    ``test`` is ``"all"``, ``"up"`` or ``"down"``; ``table`` is a DataFrame with
+    columns ``metabolite, name, z_score, p_value, n_genes, mean_z, std_z`` sorted by
+    descending ``z_score``.
+    """
+
+    test: str
+    table: pd.DataFrame
+
+
+def _gene_z(pvalues: dict[str, float]) -> dict[str, float]:
+    genes = list(pvalues)
+    z = -norm.ppf([pvalues[g] for g in genes])
+    z = np.where(np.isposinf(z), _CLAMP, z)
+    z = np.where(np.isneginf(z), -_CLAMP, z)
+    return dict(zip(genes, z, strict=True))
+
+
+def _reporter_one(model: cobra.Model, gene_z: dict[str, float], test: str) -> ReporterResult:
+    z_values = np.fromiter(gene_z.values(), dtype=float)
+    mu = float(z_values.mean()) if z_values.size else 0.0
+    sigma = float(z_values.std(ddof=0)) if z_values.size else 0.0
+
+    rows = []
+    for met in model.metabolites:
+        neighbours = {g.id for rxn in met.reactions for g in rxn.genes if g.id in gene_z}
+        if not neighbours:
+            continue
+        zs = np.array([gene_z[g] for g in neighbours])
+        n = zs.size
+        raw = zs.sum() / math.sqrt(n)
+        # Exact background correction for sampling-with-replacement (see module doc).
+        corrected = (raw - math.sqrt(n) * mu) / sigma if sigma > 0 else 0.0
+        rows.append(
+            {
+                "metabolite": met.id,
+                "name": met.name or met.id,
+                "z_score": corrected,
+                "p_value": float(1.0 - norm.cdf(corrected)),
+                "n_genes": n,
+                "mean_z": float(zs.mean()),
+                "std_z": float(zs.std(ddof=1)) if n > 1 else float("nan"),
+            }
+        )
+    table = pd.DataFrame(rows, columns=["metabolite", "name", "z_score", "p_value", "n_genes", "mean_z", "std_z"])
+    table = table.sort_values("z_score", ascending=False, ignore_index=True)
+    return ReporterResult(test, table)
+
+
+def reporter_metabolites(
+    model: cobra.Model,
+    gene_pvalues: Mapping[str, float],
+    *,
+    gene_fold_changes: Mapping[str, float] | None = None,
+) -> list[ReporterResult]:
+    """Compute Reporter Metabolites from per-gene differential-expression p-values.
+
+    ``gene_pvalues`` maps gene id → p-value (genes not in the model, or with a NaN or
+    out-of-``[0, 1]`` p-value, are dropped — a stray invalid p-value would otherwise
+    turn the whole result NaN). If ``gene_fold_changes`` (gene id → log fold change)
+    is given, two extra results are returned for the up- (fc ≥ 0) and down- (fc < 0)
+    regulated gene subsets, in addition to ``"all"``.
+
+    Parity with RAVEN's ``reporterMetabolites``: the ``z_score`` and underlying
+    background correction match exactly (exact closed-form instead of RAVEN's
+    Monte-Carlo, see IMPROVEMENTS RM1). The reported ``p_value`` is the
+    *one-sided* (``"up"``) enrichment ``1 - Φ(z)`` and the result is sorted by
+    ``z_score`` descending. RAVEN sorts by p-value and reports both tails
+    (``allPValues``, ``allUpPValues``, ``allDownPValues``); the up/down splits
+    here come from the ``gene_fold_changes`` subset partition instead, so the
+    same information is available via the three returned ``ReporterResult``
+    rows.
+    """
+    model_genes = {g.id for g in model.genes}
+    scored = {
+        g: float(p)
+        for g, p in gene_pvalues.items()
+        if g in model_genes and p is not None and not math.isnan(p) and 0.0 <= p <= 1.0
+    }
+    gene_z = _gene_z(scored)
+    results = [_reporter_one(model, gene_z, "all")]
+
+    if gene_fold_changes is not None:
+        up = {g: z for g, z in gene_z.items() if gene_fold_changes.get(g, 0.0) >= 0}
+        down = {g: z for g, z in gene_z.items() if gene_fold_changes.get(g, 0.0) < 0}
+        results.append(_reporter_one(model, up, "up"))
+        results.append(_reporter_one(model, down, "down"))
+    return results