Skip to content

Commit 7deb8d4

Browse files
authored
Omics ingestion and standard analyses (#8)
* Project scaffold: pyproject + package skeleton + README + LICENSE * Add GitHub Actions CI and the maintainer-scripts README * Add the foundation utilities: GPR, balance, parse, sort, validate * Add the model-manipulation layer (add, remove, transport, merge, etc.) * Add binary + data resolvers for external tools and published artefacts * Add YAML and SIF model I/O * Add Excel export and the Standard-GEM git-layout export * Add BLAST and DIAMOND wrappers for protein-homology searches * Add the homology-based draft model builder (getModelFromHomology port) * Add KEGG download, dump parser and taxonomy parser * Add KEGG HMM-library build and HMM-based KO assignment * Add KEGG species-model assembly (per-organism reconstruction) * Add KEGG artefact-build scripts and HMM-cutoff calibration docs * Add metabolic-task parsing and the check_tasks validator * Add connectivity gap-filling (MILP) against template models * Add the tINIT (INIT) MILP and its supporting machinery * Add the ftINIT pipeline and task-aware gap-filling * Add Human-GEM validation, parameter studies and cross-solver tests * Add HPA omics ingestion (proteomics + RNA-seq) * Add FSEOF, reporter metabolites and flux sampling * Add N-model comparison (presence + Jaccard + optional task check)
1 parent 8bd38c5 commit 7deb8d4

13 files changed

Lines changed: 1488 additions & 0 deletions

File tree

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
"""Analyses not in cobrapy's core.
2+
3+
* :func:`reporter_metabolites` — Reporter Metabolites (around-metabolite gene-score test).
4+
* :func:`fseof` — Flux Scanning based on Enforced Objective Flux.
5+
* :func:`random_sampling` — random-objective flux sampling.
6+
"""
7+
from raven_python.analysis.fseof import FSEOFResult, fseof
8+
from raven_python.analysis.reporter import ReporterResult, reporter_metabolites
9+
from raven_python.analysis.sampling import (
10+
RandomSamplingResult,
11+
find_good_reactions,
12+
random_sampling,
13+
)
14+
15+
__all__ = [
16+
"FSEOFResult",
17+
"RandomSamplingResult",
18+
"ReporterResult",
19+
"find_good_reactions",
20+
"fseof",
21+
"random_sampling",
22+
"reporter_metabolites",
23+
]

src/raven_python/analysis/fseof.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
"""Flux Scanning based on Enforced Objective Flux — FSEOF (port + redesign).
2+
3+
FSEOF (Choi et al., Appl Environ Microbiol 2010) finds metabolic-engineering targets
4+
for over-producing a metabolite: enforce an increasing flux toward the target product
5+
while optimising growth, and watch how each reaction's flux responds. This is a port
6+
of RAVEN's ``FSEOF`` with a substantially richer, more robust output (RAVEN's
7+
weaknesses are noted in IMPROVEMENTS, FS1–FS4):
8+
9+
* **Robust trend, not strict monotonicity.** Each reaction's flux is regressed against
10+
the enforced product flux across the scan; the **slope** is the response and the
11+
**correlation** (|r|) is a quality score. A reaction is a target if it tracks the
12+
product cleanly (|r| ≥ ``correlation_threshold``) — one noisy step from LP
13+
alternative optima no longer discards it (and pFBA per step keeps the scan stable).
14+
* **Direction classification RAVEN lacks.** Targets are labelled ``amplify`` (|flux|
15+
rises with the product → over-express), ``knockdown`` (|flux| falls), or ``knockout``
16+
(|flux| → ~0 → delete). RAVEN only ever reports the amplification targets.
17+
* **Gene-level view** via :attr:`FSEOFResult.gene_targets`, and the full flux scan is
18+
retained in :attr:`FSEOFResult.scan` — all as DataFrames, not a printed TSV.
19+
"""
20+
from __future__ import annotations
21+
22+
from dataclasses import dataclass
23+
24+
import cobra
25+
import numpy as np
26+
import pandas as pd
27+
from cobra.exceptions import OptimizationError
28+
from cobra.flux_analysis import pfba
29+
from scipy.stats import linregress
30+
31+
32+
@dataclass
33+
class FSEOFResult:
34+
"""FSEOF output.
35+
36+
``scan`` is reactions × enforced-flux-levels (the full flux scan); ``enforced`` are
37+
the enforced target fluxes; ``targets`` is the classified per-reaction table
38+
(sorted by score). :attr:`gene_targets` aggregates targets to genes.
39+
"""
40+
41+
scan: pd.DataFrame
42+
enforced: list[float]
43+
targets: pd.DataFrame
44+
45+
@property
46+
def amplification(self) -> pd.DataFrame:
47+
return self.targets[self.targets["target_type"] == "amplify"].reset_index(drop=True)
48+
49+
@property
50+
def knockout(self) -> pd.DataFrame:
51+
mask = self.targets["target_type"].isin(["knockout", "knockdown"])
52+
return self.targets[mask].reset_index(drop=True)
53+
54+
@property
55+
def gene_targets(self) -> pd.DataFrame:
56+
"""Per-gene aggregation: the target reactions each gene is associated with."""
57+
rows = []
58+
for _, t in self.targets.iterrows():
59+
for gene in t["genes"]:
60+
rows.append({"gene": gene, "reaction": t["reaction"],
61+
"target_type": t["target_type"], "slope": t["slope"]})
62+
if not rows:
63+
return pd.DataFrame(columns=["gene", "target_type", "reactions", "max_abs_slope"])
64+
df = pd.DataFrame(rows)
65+
agg = df.groupby("gene").agg(
66+
target_type=("target_type", lambda s: ";".join(sorted(set(s)))),
67+
reactions=("reaction", lambda s: ";".join(sorted(set(s)))),
68+
max_abs_slope=("slope", lambda s: float(np.max(np.abs(s)))),
69+
).reset_index()
70+
return agg.sort_values("max_abs_slope", ascending=False, ignore_index=True)
71+
72+
73+
def fseof(
74+
model: cobra.Model,
75+
target_rxn: str,
76+
*,
77+
biomass_rxn: str | None = None,
78+
n_steps: int = 10,
79+
max_fraction: float = 0.9,
80+
correlation_threshold: float = 0.9,
81+
flux_eps: float = 1e-6,
82+
) -> FSEOFResult:
83+
"""Run FSEOF for over-production of ``target_rxn``'s product.
84+
85+
Enforces target flux from ``max_fraction/n_steps`` up to ``max_fraction`` of the
86+
theoretical maximum in ``n_steps`` steps, maximising growth (``biomass_rxn`` or the
87+
model's current objective) with pFBA at each step. Returns an :class:`FSEOFResult`.
88+
"""
89+
with model: # find the theoretical maximum target flux
90+
model.objective = target_rxn
91+
target_opt = model.slim_optimize()
92+
# slim_optimize returns NaN on an infeasible model; np.isfinite catches that too.
93+
if target_opt is None or not np.isfinite(target_opt) or target_opt <= flux_eps:
94+
raise ValueError(f"{target_rxn!r} cannot carry positive flux; nothing to scan.")
95+
target_max = target_opt * max_fraction
96+
levels = [target_max * (i + 1) / n_steps for i in range(n_steps)]
97+
98+
columns: dict[float, pd.Series] = {}
99+
enforced: list[float] = []
100+
for level in levels:
101+
with model:
102+
if biomass_rxn is not None:
103+
model.objective = biomass_rxn
104+
model.reactions.get_by_id(target_rxn).lower_bound = level
105+
try:
106+
columns[level] = pfba(model).fluxes
107+
except OptimizationError:
108+
break # enforced flux became infeasible — stop scanning
109+
enforced.append(level)
110+
if len(enforced) < 2:
111+
raise RuntimeError("FSEOF needs at least two feasible enforced-flux levels.")
112+
113+
scan = pd.DataFrame(columns)
114+
targets = _classify(model, scan, np.asarray(enforced), correlation_threshold, flux_eps)
115+
return FSEOFResult(scan=scan, enforced=enforced, targets=targets)
116+
117+
118+
def _classify(model, scan, enforced, corr_threshold, flux_eps) -> pd.DataFrame:
119+
rows = []
120+
for rxn in model.reactions:
121+
flux = scan.loc[rxn.id, enforced.tolist() if hasattr(enforced, "tolist") else enforced]
122+
flux = flux.to_numpy(dtype=float)
123+
initial, final = flux[0], flux[-1]
124+
if flux.std() < flux_eps: # flat -> no response
125+
continue
126+
fit = linregress(enforced, flux)
127+
slope, corr = float(fit.slope), float(fit.rvalue)
128+
if abs(corr) < corr_threshold or abs(slope) < flux_eps:
129+
continue
130+
# Classify on the slope of |flux| vs the enforced product flux — the
131+
# criterion the docstring states (|flux| rises = amplify, etc.). The
132+
# old endpoint-only check (``abs(final) vs abs(initial)``) could
133+
# mislabel a track whose first/last values straddled a peak/trough but
134+
# whose overall trend was the opposite. Keep ``knockout`` for tracks
135+
# the regression drives essentially to zero.
136+
abs_fit = linregress(enforced, np.abs(flux))
137+
abs_slope = float(abs_fit.slope)
138+
if abs(final) < flux_eps and abs_slope < 0:
139+
ttype = "knockout"
140+
elif abs_slope > 0:
141+
ttype = "amplify"
142+
else:
143+
ttype = "knockdown"
144+
rows.append({
145+
"reaction": rxn.id,
146+
"name": rxn.name,
147+
"subsystem": rxn.subsystem,
148+
"gene_reaction_rule": rxn.gene_reaction_rule,
149+
"genes": sorted(g.id for g in rxn.genes),
150+
"target_type": ttype,
151+
"slope": slope,
152+
"correlation": corr,
153+
"initial_flux": initial,
154+
"final_flux": final,
155+
"score": abs(slope) * abs(corr),
156+
})
157+
table = pd.DataFrame(rows, columns=[
158+
"reaction", "name", "subsystem", "gene_reaction_rule", "genes",
159+
"target_type", "slope", "correlation", "initial_flux", "final_flux", "score",
160+
])
161+
return table.sort_values("score", ascending=False, ignore_index=True)
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
"""Reporter Metabolites — metabolites around which transcriptional change concentrates.
2+
3+
Patil & Nielsen, PNAS 2005. Each gene's differential-expression p-value becomes a
4+
Z-score ``z = -Φ⁻¹(p)``; for every metabolite the Z-scores of the genes on its
5+
neighbouring reactions are aggregated (``Σz / √n``), background-corrected, and turned
6+
back into a p-value.
7+
8+
The background correction has an exact closed form (sampling with replacement from the
9+
scored-gene pool: a random ``Σz/√n`` has mean ``√n·μ`` and standard deviation ``σ``
10+
with μ, σ the mean/std of the scored Z-scores), so the corrected score is just
11+
``(metZ − √n·μ) / σ`` — no Monte-Carlo sampling needed.
12+
"""
13+
from __future__ import annotations
14+
15+
import math
16+
from collections.abc import Mapping
17+
from dataclasses import dataclass
18+
19+
import cobra
20+
import numpy as np
21+
import pandas as pd
22+
from scipy.stats import norm
23+
24+
_CLAMP = 15.0 # |Z| cap for p-values of exactly 0 or 1 (RAVEN's ±15)
25+
26+
27+
@dataclass
28+
class ReporterResult:
29+
"""Reporter-metabolite scores for one gene set.
30+
31+
``test`` is ``"all"``, ``"up"`` or ``"down"``; ``table`` is a DataFrame with
32+
columns ``metabolite, name, z_score, p_value, n_genes, mean_z, std_z`` sorted by
33+
descending ``z_score``.
34+
"""
35+
36+
test: str
37+
table: pd.DataFrame
38+
39+
40+
def _gene_z(pvalues: dict[str, float]) -> dict[str, float]:
41+
genes = list(pvalues)
42+
z = -norm.ppf([pvalues[g] for g in genes])
43+
z = np.where(np.isposinf(z), _CLAMP, z)
44+
z = np.where(np.isneginf(z), -_CLAMP, z)
45+
return dict(zip(genes, z, strict=True))
46+
47+
48+
def _reporter_one(model: cobra.Model, gene_z: dict[str, float], test: str) -> ReporterResult:
49+
z_values = np.fromiter(gene_z.values(), dtype=float)
50+
mu = float(z_values.mean()) if z_values.size else 0.0
51+
sigma = float(z_values.std(ddof=0)) if z_values.size else 0.0
52+
53+
rows = []
54+
for met in model.metabolites:
55+
neighbours = {g.id for rxn in met.reactions for g in rxn.genes if g.id in gene_z}
56+
if not neighbours:
57+
continue
58+
zs = np.array([gene_z[g] for g in neighbours])
59+
n = zs.size
60+
raw = zs.sum() / math.sqrt(n)
61+
# Exact background correction for sampling-with-replacement (see module doc).
62+
corrected = (raw - math.sqrt(n) * mu) / sigma if sigma > 0 else 0.0
63+
rows.append(
64+
{
65+
"metabolite": met.id,
66+
"name": met.name or met.id,
67+
"z_score": corrected,
68+
"p_value": float(1.0 - norm.cdf(corrected)),
69+
"n_genes": n,
70+
"mean_z": float(zs.mean()),
71+
"std_z": float(zs.std(ddof=1)) if n > 1 else float("nan"),
72+
}
73+
)
74+
table = pd.DataFrame(rows, columns=["metabolite", "name", "z_score", "p_value", "n_genes", "mean_z", "std_z"])
75+
table = table.sort_values("z_score", ascending=False, ignore_index=True)
76+
return ReporterResult(test, table)
77+
78+
79+
def reporter_metabolites(
80+
model: cobra.Model,
81+
gene_pvalues: Mapping[str, float],
82+
*,
83+
gene_fold_changes: Mapping[str, float] | None = None,
84+
) -> list[ReporterResult]:
85+
"""Compute Reporter Metabolites from per-gene differential-expression p-values.
86+
87+
``gene_pvalues`` maps gene id → p-value (genes not in the model, or with a NaN or
88+
out-of-``[0, 1]`` p-value, are dropped — a stray invalid p-value would otherwise
89+
turn the whole result NaN). If ``gene_fold_changes`` (gene id → log fold change)
90+
is given, two extra results are returned for the up- (fc ≥ 0) and down- (fc < 0)
91+
regulated gene subsets, in addition to ``"all"``.
92+
93+
Parity with RAVEN's ``reporterMetabolites``: the ``z_score`` and underlying
94+
background correction match exactly (exact closed-form instead of RAVEN's
95+
Monte-Carlo, see IMPROVEMENTS RM1). The reported ``p_value`` is the
96+
*one-sided* (``"up"``) enrichment ``1 - Φ(z)`` and the result is sorted by
97+
``z_score`` descending. RAVEN sorts by p-value and reports both tails
98+
(``allPValues``, ``allUpPValues``, ``allDownPValues``); the up/down splits
99+
here come from the ``gene_fold_changes`` subset partition instead, so the
100+
same information is available via the three returned ``ReporterResult``
101+
rows.
102+
"""
103+
model_genes = {g.id for g in model.genes}
104+
scored = {
105+
g: float(p)
106+
for g, p in gene_pvalues.items()
107+
if g in model_genes and p is not None and not math.isnan(p) and 0.0 <= p <= 1.0
108+
}
109+
gene_z = _gene_z(scored)
110+
results = [_reporter_one(model, gene_z, "all")]
111+
112+
if gene_fold_changes is not None:
113+
up = {g: z for g, z in gene_z.items() if gene_fold_changes.get(g, 0.0) >= 0}
114+
down = {g: z for g, z in gene_z.items() if gene_fold_changes.get(g, 0.0) < 0}
115+
results.append(_reporter_one(model, up, "up"))
116+
results.append(_reporter_one(model, down, "down"))
117+
return results

0 commit comments

Comments
 (0)