From 50bea4006f174a3549e670fd3ea622f8e31083cf Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:46:42 +0200 Subject: [PATCH 1/3] Add the foundation utilities: GPR, balance, parse, sort, validate --- src/raven_python/utils/__init__.py | 16 ++++ src/raven_python/utils/balance.py | 89 +++++++++++++++++++++ src/raven_python/utils/gpr.py | 119 +++++++++++++++++++++++++++++ src/raven_python/utils/parse.py | 33 ++++++++ src/raven_python/utils/sort.py | 21 +++++ src/raven_python/utils/validate.py | 86 +++++++++++++++++++++ tests/test_utils_balance.py | 76 ++++++++++++++++++ tests/test_utils_gpr.py | 84 ++++++++++++++++++++ tests/test_utils_sort.py | 42 ++++++++++ tests/test_utils_validate.py | 80 +++++++++++++++++++ 10 files changed, 646 insertions(+) create mode 100644 src/raven_python/utils/__init__.py create mode 100644 src/raven_python/utils/balance.py create mode 100644 src/raven_python/utils/gpr.py create mode 100644 src/raven_python/utils/parse.py create mode 100644 src/raven_python/utils/sort.py create mode 100644 src/raven_python/utils/validate.py create mode 100644 tests/test_utils_balance.py create mode 100644 tests/test_utils_gpr.py create mode 100644 tests/test_utils_sort.py create mode 100644 tests/test_utils_validate.py diff --git a/src/raven_python/utils/__init__.py b/src/raven_python/utils/__init__.py new file mode 100644 index 0000000..7127bdd --- /dev/null +++ b/src/raven_python/utils/__init__.py @@ -0,0 +1,16 @@ +"""Shared helpers — GPR linting, elemental balance, model curation checks, id sorting.""" +from raven_python.utils.balance import ElementalBalance, get_elemental_balance +from raven_python.utils.gpr import GPRIssue, find_non_dnf_grrules, is_dnf +from raven_python.utils.sort import sort_identifiers +from raven_python.utils.validate import ModelIssue, check_model + +__all__ = [ + "ElementalBalance", + "GPRIssue", + "ModelIssue", + "check_model", + "find_non_dnf_grrules", + "get_elemental_balance", + "is_dnf", + "sort_identifiers", +] diff --git a/src/raven_python/utils/balance.py b/src/raven_python/utils/balance.py new file mode 100644 index 0000000..ee64ab4 --- /dev/null +++ b/src/raven_python/utils/balance.py @@ -0,0 +1,89 @@ +"""Check the elemental balance of reactions, distinguishing *unbalanced* from +*unknown* (missing formula). + +cobra's ``reaction.check_mass_balance()`` silently treats a missing formula as +empty, so a reaction can look "unbalanced" — or even balanced — when the truth is +that the data is incomplete. This module checks for missing formulas first and +returns a graded status +per reaction (``balanced`` / ``unbalanced`` / ``unknown``) plus the element +imbalance — over a batch, as structured data. +""" +from __future__ import annotations + +from dataclasses import dataclass, field + +import cobra + + +@dataclass(frozen=True) +class ElementalBalance: + """Balance result for one reaction. + + Attributes + ---------- + reaction_id + ID of the reaction. + status + ``"balanced"`` — elements balance; + ``"unbalanced"`` — they do not (see ``imbalance``); + ``"unknown"`` — at least one metabolite has no formula, so it cannot be + determined (cobra would silently miscount these). + imbalance + Element → net coefficient (products − reactants), only for + ``"unbalanced"``; empty otherwise. Charge is not included. + """ + + reaction_id: str + status: str + imbalance: dict[str, float] = field(default_factory=dict) + + +def get_elemental_balance( + model: cobra.Model, reactions=None +) -> list[ElementalBalance]: + """Check whether reactions are elementally balanced. + Parameters + ---------- + reactions + Reaction IDs/objects to check; default all reactions. (Boundary + reactions exchange mass with the environment and will read as + ``unbalanced`` — filter them out if that is not wanted.) + + Returns + ------- + list of ElementalBalance + One entry per checked reaction, in model order. + """ + if reactions is None: + rxns = list(model.reactions) + else: + if isinstance(reactions, (str, cobra.Reaction)): + reactions = [reactions] + rxns = [ + r if isinstance(r, cobra.Reaction) else model.reactions.get_by_id(r) + for r in reactions + ] + + results: list[ElementalBalance] = [] + for rxn in rxns: + if not rxn.metabolites: + # A reaction with no metabolites used to fall through to ``balanced`` + # (vacuously) because ``any()`` over the empty list is False and the + # zero-element imbalance dict is empty. Treat the no-formula case + # (zero formulae present) as ``unknown``: we can't determine balance + # for a reaction without stoichiometry. + results.append(ElementalBalance(rxn.id, "unknown")) + continue + if any(not met.formula for met in rxn.metabolites): + results.append(ElementalBalance(rxn.id, "unknown")) + continue + imbalance = { + element: amount + for element, amount in rxn.check_mass_balance().items() + if element != "charge" + } + if imbalance: + results.append(ElementalBalance(rxn.id, "unbalanced", imbalance)) + else: + results.append(ElementalBalance(rxn.id, "balanced")) + return results diff --git a/src/raven_python/utils/gpr.py b/src/raven_python/utils/gpr.py new file mode 100644 index 0000000..2e2122d --- /dev/null +++ b/src/raven_python/utils/gpr.py @@ -0,0 +1,119 @@ +"""GPR (gene-protein-reaction rule) linting. + +Flag GPRs that are *not* in disjunctive normal form ("OR of AND-complexes"), via cobra's +GPR AST. GPR syntax *normalisation* is already done by cobra on assignment, so it isn't +re-implemented here. + +Part (2) has no cobrapy equivalent and is ported here, reworked onto cobra's +GPR AST instead of RAVEN's brittle substring search. The relevant property is +**disjunctive normal form (DNF)**: an OR of AND-clauses of single genes, e.g. +``(G1 and G2) or G3``. Rules where an AND contains an OR — e.g. +``(G1 or G2) and (G3 or G4)`` — are *valid* for cobra but ambiguous for the +isoenzyme/complex reasoning used across RAVEN/GECKO, and ``expand_model`` +(see :mod:`raven_python.manipulation.expand`) only does something for DNF rules. +:func:`find_non_dnf_grrules` surfaces them as structured data rather than, as +RAVEN did, only printing a warning. +""" +from __future__ import annotations + +import ast +from dataclasses import dataclass + +import cobra +from cobra.core.gene import GPR + + +def _contains_or(node: ast.AST | None) -> bool: + """True if ``node``'s subtree contains an OR operator anywhere.""" + if isinstance(node, ast.BoolOp): + if isinstance(node.op, ast.Or): + return True + return any(_contains_or(value) for value in node.values) + return False + + +def _is_dnf_node(node: ast.AST | None) -> bool: + """True if the AST rooted at ``node`` is in disjunctive normal form. + + DNF here means no AND operator has an OR anywhere beneath it, i.e. the + rule is a single gene, a pure AND-complex, or an OR of those. + """ + if node is None or isinstance(node, ast.Name): + return True + if isinstance(node, ast.BoolOp): + if isinstance(node.op, ast.And): + return not any(_contains_or(value) for value in node.values) + # OR: every disjunct must itself be DNF + return all(_is_dnf_node(value) for value in node.values) + # Unknown node type: don't flag it as a problem. + return True + + +def is_dnf(gpr: GPR | str | None) -> bool: + """Return whether a GPR is in disjunctive normal form (OR of AND-complexes). + + Parameters + ---------- + gpr + A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``. + An empty/``None`` rule is trivially DNF. + + Examples + -------- + >>> is_dnf("(G1 and G2) or G3") + True + >>> is_dnf("(G1 or G2) and G3") + False + """ + if isinstance(gpr, str): + gpr = GPR.from_string(gpr) + if gpr is None: + return True + return _is_dnf_node(gpr.body) + + +@dataclass(frozen=True) +class GPRIssue: + """A reaction whose GPR is flagged by the linter. + + Attributes + ---------- + reaction_id + ID of the reaction. + gpr + The (already cobra-normalised) grRule string. + reason + Human-readable explanation of why it was flagged. + """ + + reaction_id: str + gpr: str + reason: str + + +_NON_DNF_REASON = ( + "GPR is not in disjunctive normal form (an AND clause contains an OR). " + "Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, " + 'e.g. rewrite "(G1 or G2) and (G3 or G4)" as ' + '"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".' +) + + +def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]: + """Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes"). + + Uses cobra's GPR AST. Reactions with no GPR are skipped. + + Returns + ------- + list of GPRIssue + One entry per flagged reaction, in model reaction order. Empty if all + GPRs are simple OR-of-AND-complexes. + """ + issues: list[GPRIssue] = [] + for rxn in model.reactions: + if not rxn.gene_reaction_rule: + continue + if not is_dnf(rxn.gpr): + issues.append(GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON)) + return issues diff --git a/src/raven_python/utils/parse.py b/src/raven_python/utils/parse.py new file mode 100644 index 0000000..8068f6c --- /dev/null +++ b/src/raven_python/utils/parse.py @@ -0,0 +1,33 @@ +"""Small parsing helpers shared across raven_python.""" +from __future__ import annotations + +import re + +# A metabolite written as ``name[comp]``. The name is greedy so that, for a +# pathological name that itself contains brackets, the *last* ``[...]`` is taken +# as the compartment (matching RAVEN getIndexes' ``max(strfind('['))`` rule). +_NAME_COMP_RE = re.compile(r"^(?P.+)\[(?P[^\[\]]+)\]$") + + +def parse_name_comp(token: str) -> tuple[str, str | None]: + """Split a ``name[comp]`` token into ``(name, compartment)``. + + This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``' + ``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written + as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``. + + Returns ``(name, None)`` when there is no trailing ``[...]``. + + Examples + -------- + >>> parse_name_comp("ATP[c]") + ('ATP', 'c') + >>> parse_name_comp("ATP") + ('ATP', None) + >>> parse_name_comp("weird[name][m]") + ('weird[name]', 'm') + """ + match = _NAME_COMP_RE.match(token.strip()) + if match: + return match.group("name").strip(), match.group("comp").strip() + return token.strip(), None diff --git a/src/raven_python/utils/sort.py b/src/raven_python/utils/sort.py new file mode 100644 index 0000000..a8641a8 --- /dev/null +++ b/src/raven_python/utils/sort.py @@ -0,0 +1,21 @@ +"""Sort a model's identifiers alphabetically — useful for deterministic, +diff-friendly output. + +cobra's ``DictList.sort`` reorders one list (and rebuilds its lookup index), but +there is no single "sort the whole model" call; this provides it. +""" +from __future__ import annotations + +import cobra + + +def sort_identifiers(model: cobra.Model) -> cobra.Model: + """Sort reactions, metabolites and genes alphabetically by ID, in place. + + Returns the same (mutated) model for convenience. Compartments are a plain + dict and are emitted sorted by writers as needed. + """ + model.reactions.sort(key=lambda r: r.id) + model.metabolites.sort(key=lambda m: m.id) + model.genes.sort(key=lambda g: g.id) + return model diff --git a/src/raven_python/utils/validate.py b/src/raven_python/utils/validate.py new file mode 100644 index 0000000..c08df48 --- /dev/null +++ b/src/raven_python/utils/validate.py @@ -0,0 +1,86 @@ +"""Curation checks for a model. + +A QC bundle cobra has no single call for: orphaned objects, empty reactions, +duplicated metabolite ``name + compartment``, empty names, and objective sanity. +:func:`check_model` returns these as structured :class:`ModelIssue` records. +""" +from __future__ import annotations + +from dataclasses import dataclass + +import cobra + + +@dataclass(frozen=True) +class ModelIssue: + """One curation issue found in a model. + + Attributes + ---------- + category + Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``, + ``"orphan_gene"``, ``"duplicate_name_compartment"``, + ``"empty_metabolite_name"``, ``"objective"``. + object_id + ID of the offending object, or ``None`` for model-level issues. + message + Human-readable description. + """ + + category: str + object_id: str | None + message: str + + +def check_model(model: cobra.Model) -> list[ModelIssue]: + """Run curation checks on a model and return the issues found. + + Does not + raise; returns a (possibly empty) list of :class:`ModelIssue`. + """ + issues: list[ModelIssue] = [] + + for met in model.metabolites: + if not met.reactions: + issues.append( + ModelIssue("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.") + ) + if not (met.name and str(met.name).strip()): + issues.append( + ModelIssue("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.") + ) + + for gene in model.genes: + if not gene.reactions: + issues.append( + ModelIssue("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.") + ) + + for rxn in model.reactions: + if not rxn.metabolites: + issues.append( + ModelIssue("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.") + ) + + by_name_comp: dict[tuple[str, str], list[str]] = {} + for met in model.metabolites: + by_name_comp.setdefault((met.name, met.compartment), []).append(met.id) + for (name, comp), ids in by_name_comp.items(): + if name and len(ids) > 1: + issues.append( + ModelIssue( + "duplicate_name_compartment", + None, + f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}", + ) + ) + + objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0] + if not objective_rxns: + issues.append(ModelIssue("objective", None, "No reaction has a nonzero objective coefficient.")) + elif len(objective_rxns) > 1: + issues.append( + ModelIssue("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}") + ) + + return issues diff --git a/tests/test_utils_balance.py b/tests/test_utils_balance.py new file mode 100644 index 0000000..aa1e47e --- /dev/null +++ b/tests/test_utils_balance.py @@ -0,0 +1,76 @@ +"""Tests for get_elemental_balance (getElementalBalance port).""" +import cobra +import pytest + +from raven_python.utils import ElementalBalance, get_elemental_balance + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", formula="C6H12O6", charge=0, compartment="c"), + cobra.Metabolite("b_c", formula="C6H12O6", charge=0, compartment="c"), + cobra.Metabolite("c_c", formula="C3H6O3", charge=0, compartment="c"), + cobra.Metabolite("n_c", compartment="c"), # no formula + ] + ) + r_bal = cobra.Reaction("R_bal") + m.add_reactions([r_bal]) + r_bal.build_reaction_from_string("a_c --> b_c") # C6H12O6 -> C6H12O6 + r_unbal = cobra.Reaction("R_unbal") + m.add_reactions([r_unbal]) + r_unbal.build_reaction_from_string("a_c --> c_c") # C6H12O6 -> C3H6O3 + r_unknown = cobra.Reaction("R_unknown") + m.add_reactions([r_unknown]) + r_unknown.build_reaction_from_string("a_c --> n_c") # n_c has no formula + return m + + +def test_balanced(model): + (res,) = get_elemental_balance(model, ["R_bal"]) + assert res == ElementalBalance("R_bal", "balanced", {}) + + +def test_unbalanced_reports_imbalance(model): + (res,) = get_elemental_balance(model, ["R_unbal"]) + assert res.status == "unbalanced" + # products - reactants: C3H6O3 - C6H12O6 = -C3H6O3 + assert res.imbalance == {"C": -3.0, "H": -6.0, "O": -3.0} + + +def test_missing_formula_is_unknown_not_silently_wrong(model): + # cobra's check_mass_balance alone would silently report an imbalance here; + # we flag it as unknown instead. + (res,) = get_elemental_balance(model, ["R_unknown"]) + assert res.status == "unknown" + assert res.imbalance == {} + + +def test_all_reactions_default(model): + results = get_elemental_balance(model) + assert {r.reaction_id: r.status for r in results} == { + "R_bal": "balanced", + "R_unbal": "unbalanced", + "R_unknown": "unknown", + } + + +def test_charge_excluded(model): + # give a charge imbalance but keep elements balanced -> still "balanced" + model.metabolites.get_by_id("b_c").charge = 1 + (res,) = get_elemental_balance(model, ["R_bal"]) + assert res.status == "balanced" + + +# --- regression: empty reaction → unknown (known_issues.md F5) ------------- + +def test_empty_reaction_is_unknown(model): + """A reaction with no metabolites used to be reported `balanced` + vacuously (any() over an empty list is False and check_mass_balance + returns no imbalance). Now reports `unknown`.""" + empty = cobra.Reaction("R_empty", lower_bound=0, upper_bound=1000) + model.add_reactions([empty]) + (res,) = get_elemental_balance(model, ["R_empty"]) + assert res.status == "unknown" diff --git a/tests/test_utils_gpr.py b/tests/test_utils_gpr.py new file mode 100644 index 0000000..275d020 --- /dev/null +++ b/tests/test_utils_gpr.py @@ -0,0 +1,84 @@ +"""Tests for raven_python.utils.gpr (GPR linting).""" +import cobra +import pytest + +from raven_python.utils import GPRIssue, find_non_dnf_grrules, is_dnf + + +@pytest.mark.parametrize( + "rule", + [ + "", + "G1", + "G1 and G2", + "G1 or G2", + "G1 and G2 and G3", + "G1 or G2 or G3", + "(G1 and G2) or G3", + "(G1 and G2) or (G3 and G4)", + "G1 or (G2 and G3)", + ], +) +def test_is_dnf_true(rule): + assert is_dnf(rule) is True + + +@pytest.mark.parametrize( + "rule", + [ + "(G1 or G2) and G3", + "G1 and (G2 or G3)", + "(G1 or G2) and (G3 or G4)", + "G1 and (G2 or (G3 and G4))", + ], +) +def test_is_dnf_false(rule): + assert is_dnf(rule) is False + + +def test_is_dnf_accepts_gpr_and_none(): + from cobra.core.gene import GPR + + assert is_dnf(GPR.from_string("(G1 or G2) and G3")) is False + assert is_dnf(GPR.from_string("G1 or G2")) is True + assert is_dnf(None) is True + + +def test_is_dnf_independent_of_formatting(): + # cobra normalises on assignment, so casing/whitespace cannot change the verdict. + assert is_dnf("(G1 OR G2) AND G3") is False + assert is_dnf("( G1 and G2 ) or G3") is True + + +def _model_with_rules(rules: dict[str, str]) -> cobra.Model: + model = cobra.Model("t") + model.add_reactions([cobra.Reaction(rid) for rid in rules]) + for rid, rule in rules.items(): + model.reactions.get_by_id(rid).gene_reaction_rule = rule + return model + + +def test_find_non_dnf_grrules_flags_only_offenders(): + model = _model_with_rules( + { + "R_ok_single": "G1", + "R_ok_complex": "G1 and G2", + "R_ok_dnf": "(G1 and G2) or G3", + "R_no_gpr": "", + "R_bad_1": "(G1 or G2) and G3", + "R_bad_2": "(G1 or G2) and (G3 or G4)", + } + ) + + issues = find_non_dnf_grrules(model) + + assert [i.reaction_id for i in issues] == ["R_bad_1", "R_bad_2"] + assert all(isinstance(i, GPRIssue) for i in issues) + assert all("disjunctive normal form" in i.reason for i in issues) + # the reported GPR is the cobra-normalised string + assert issues[0].gpr == "(G1 or G2) and G3" + + +def test_find_non_dnf_grrules_empty_when_all_clean(): + model = _model_with_rules({"R1": "G1 or G2", "R2": "(G1 and G2) or G3"}) + assert find_non_dnf_grrules(model) == [] diff --git a/tests/test_utils_sort.py b/tests/test_utils_sort.py new file mode 100644 index 0000000..18bca24 --- /dev/null +++ b/tests/test_utils_sort.py @@ -0,0 +1,42 @@ +"""Tests for sort_identifiers and write_yaml_model(sort_ids=True).""" +import cobra + +from raven_python.io import read_yaml_model, write_yaml_model +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils import sort_identifiers + + +def _model(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("b_c", "a_c")]) + add_reactions_from_equations( + m, + [ + {"id": "R2", "equation": "a_c --> b_c", "gene_reaction_rule": "GB"}, + {"id": "R1", "equation": "b_c --> a_c", "gene_reaction_rule": "GA"}, + ], + ) + return m + + +def test_sort_identifiers_orders_everything(): + m = _model() + sort_identifiers(m) + assert [r.id for r in m.reactions] == ["R1", "R2"] + assert [x.id for x in m.metabolites] == ["a_c", "b_c"] + assert [g.id for g in m.genes] == ["GA", "GB"] + # lookup index still intact after sorting + assert m.reactions.get_by_id("R2").id == "R2" + + +def test_write_yaml_sort_ids_does_not_mutate(tmp_path): + m = _model() + order_before = [r.id for r in m.reactions] + out = tmp_path / "m.yml" + write_yaml_model(m, out, sort_ids=True) + assert [r.id for r in m.reactions] == order_before # model untouched + # but the file is sorted + text = out.read_text() + assert text.index("R1") < text.index("R2") + reloaded = read_yaml_model(out) + assert [r.id for r in reloaded.reactions] == ["R1", "R2"] diff --git a/tests/test_utils_validate.py b/tests/test_utils_validate.py new file mode 100644 index 0000000..2d38e6f --- /dev/null +++ b/tests/test_utils_validate.py @@ -0,0 +1,80 @@ +"""Tests for check_model (the surviving checks of checkModelStruct).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils import ModelIssue, check_model + + +def _categories(issues, category): + return [i.object_id for i in issues if i.category == category] + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", name="A", compartment="c"), + cobra.Metabolite("b_c", name="B", compartment="c"), + ] + ) + add_reactions_from_equations( + m, [{"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}] + ) + m.reactions.get_by_id("R1").objective_coefficient = 1 + return m + + +def test_clean_model_has_no_issues(model): + assert check_model(model) == [] + + +def test_orphan_metabolite(model): + model.add_metabolites([cobra.Metabolite("orphan_c", name="Orphan", compartment="c")]) + assert "orphan_c" in _categories(check_model(model), "orphan_metabolite") + + +def test_orphan_gene(model): + model.genes.append(cobra.core.gene.Gene("G_lonely")) + assert "G_lonely" in _categories(check_model(model), "orphan_gene") + + +def test_empty_reaction(model): + model.add_reactions([cobra.Reaction("R_empty")]) + assert "R_empty" in _categories(check_model(model), "empty_reaction") + + +def test_empty_metabolite_name(model): + model.add_metabolites([cobra.Metabolite("noname_c", compartment="c")]) + # also an orphan, but we check the name category specifically + assert "noname_c" in _categories(check_model(model), "empty_metabolite_name") + + +def test_duplicate_name_compartment(model): + # second metabolite named "A" in compartment c + dup = cobra.Metabolite("a2_c", name="A", compartment="c") + model.add_metabolites([dup]) + model.reactions.get_by_id("R1").add_metabolites({dup: -1}) # keep it used + issues = [i for i in check_model(model) if i.category == "duplicate_name_compartment"] + assert len(issues) == 1 + assert "a_c" in issues[0].message and "a2_c" in issues[0].message + + +def test_no_objective(model): + model.reactions.get_by_id("R1").objective_coefficient = 0 + cats = [i.category for i in check_model(model)] + assert "objective" in cats + + +def test_multiple_objectives(model): + add_reactions_from_equations(model, [{"id": "R2", "equation": "b_c --> a_c"}]) + model.reactions.get_by_id("R2").objective_coefficient = 1 + obj_issues = [i for i in check_model(model) if i.category == "objective"] + assert len(obj_issues) == 1 + assert "Multiple" in obj_issues[0].message + + +def test_returns_model_issue_instances(model): + model.add_reactions([cobra.Reaction("R_empty")]) + assert all(isinstance(i, ModelIssue) for i in check_model(model)) From 4bc0d6ee29070a0bad4e347a78947c7fae8199a0 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:46:59 +0200 Subject: [PATCH 2/3] Add the model-manipulation layer (add, remove, transport, merge, etc.) --- src/raven_python/manipulation/__init__.py | 36 ++ src/raven_python/manipulation/add.py | 345 ++++++++++++++++++ src/raven_python/manipulation/change.py | 125 +++++++ src/raven_python/manipulation/compartments.py | 196 ++++++++++ src/raven_python/manipulation/expand.py | 124 +++++++ src/raven_python/manipulation/irreversible.py | 72 ++++ src/raven_python/manipulation/merge.py | 146 ++++++++ src/raven_python/manipulation/parameters.py | 78 ++++ src/raven_python/manipulation/remove.py | 120 ++++++ src/raven_python/manipulation/simplify.py | 229 ++++++++++++ src/raven_python/manipulation/transfer.py | 144 ++++++++ src/raven_python/manipulation/transport.py | 157 ++++++++ tests/test_change_grrules.py | 49 +++ tests/test_manipulation_add.py | 278 ++++++++++++++ tests/test_manipulation_change.py | 93 +++++ tests/test_manipulation_compartments.py | 139 +++++++ tests/test_manipulation_expand.py | 288 +++++++++++++++ tests/test_manipulation_irreversible.py | 144 ++++++++ tests/test_manipulation_merge.py | 136 +++++++ tests/test_manipulation_remove.py | 97 +++++ tests/test_manipulation_simplify.py | 184 ++++++++++ tests/test_manipulation_transfer.py | 137 +++++++ tests/test_manipulation_transport.py | 98 +++++ tests/test_parameters.py | 60 +++ 24 files changed, 3475 insertions(+) create mode 100644 src/raven_python/manipulation/__init__.py create mode 100644 src/raven_python/manipulation/add.py create mode 100644 src/raven_python/manipulation/change.py create mode 100644 src/raven_python/manipulation/compartments.py create mode 100644 src/raven_python/manipulation/expand.py create mode 100644 src/raven_python/manipulation/irreversible.py create mode 100644 src/raven_python/manipulation/merge.py create mode 100644 src/raven_python/manipulation/parameters.py create mode 100644 src/raven_python/manipulation/remove.py create mode 100644 src/raven_python/manipulation/simplify.py create mode 100644 src/raven_python/manipulation/transfer.py create mode 100644 src/raven_python/manipulation/transport.py create mode 100644 tests/test_change_grrules.py create mode 100644 tests/test_manipulation_add.py create mode 100644 tests/test_manipulation_change.py create mode 100644 tests/test_manipulation_compartments.py create mode 100644 tests/test_manipulation_expand.py create mode 100644 tests/test_manipulation_irreversible.py create mode 100644 tests/test_manipulation_merge.py create mode 100644 tests/test_manipulation_remove.py create mode 100644 tests/test_manipulation_simplify.py create mode 100644 tests/test_manipulation_transfer.py create mode 100644 tests/test_manipulation_transport.py create mode 100644 tests/test_parameters.py diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py new file mode 100644 index 0000000..074c36f --- /dev/null +++ b/src/raven_python/manipulation/__init__.py @@ -0,0 +1,36 @@ +"""Generic cobra.Model structural transforms that cobrapy does not cover cleanly: +reaction building from equations, batch GPR / bound changes, irreversibility splitting, +isozyme expansion, compartment merge / copy, and model merging by name.""" +from .add import add_reactions_from_equations +from .change import change_gene_reaction_rules, change_reaction_equations +from .expand import expand_model +from .irreversible import convert_to_irreversible +from .merge import merge_models +from .parameters import set_variance_bounds +from .remove import remove_genes, remove_metabolites +from .simplify import ( + constrain_reversible_reactions, + group_linear_reactions, + remove_dead_end_reactions, + remove_duplicate_reactions, +) +from .transfer import add_reactions_from_model +from .transport import add_transport_reactions + +__all__ = [ + "add_reactions_from_equations", + "add_reactions_from_model", + "add_transport_reactions", + "change_gene_reaction_rules", + "change_reaction_equations", + "constrain_reversible_reactions", + "convert_to_irreversible", + "expand_model", + "group_linear_reactions", + "merge_models", + "remove_dead_end_reactions", + "remove_duplicate_reactions", + "remove_genes", + "remove_metabolites", + "set_variance_bounds", +] diff --git a/src/raven_python/manipulation/add.py b/src/raven_python/manipulation/add.py new file mode 100644 index 0000000..3842297 --- /dev/null +++ b/src/raven_python/manipulation/add.py @@ -0,0 +1,345 @@ +"""Add reactions to a model from equation strings. + +Most of the equivalent MATLAB code is struct-of-arrays bookkeeping (padding parallel +``rxnNames`` / ``lb`` / ``ub`` / ``grRules`` / ... fields) that does not exist in +cobra, where each ``Reaction`` carries its own attributes. cobra also already +covers a large part of the *behaviour*: + +* ``Reaction.build_reaction_from_string`` parses equation strings, coefficients, + and reversibility arrows (``<=>``, ``-->``, ``=>``) and creates unknown + metabolites — but only matching metabolites **by ID**, and it leaves new + metabolites with ``compartment=None``. +* assigning ``reaction.gene_reaction_rule`` auto-creates ``Gene`` objects. + +So this port keeps only the parts cobra lacks: + +* **name-based matching** — interpret equation tokens as metabolite *names* + (RAVEN eqnType 2) or as ``name[comp]`` (eqnType 3), not just IDs; +* **correct compartment** assignment for newly created metabolites; +* **strict policies** — optionally *error* (rather than silently create) on + unknown metabolites or genes, and always error on a duplicate reaction ID + (cobra silently ignores those). + +Instead of RAVEN's ``eqnType`` integer (1/2/3) the matching mode is a readable +keyword: ``mets_by="id"`` or ``mets_by="name"``, with ``name[comp]`` recognised +automatically. See IMPROVEMENTS.md (A-series) for the rationale. +""" +from __future__ import annotations + +import re +import warnings +from collections import OrderedDict +from collections.abc import Mapping, Sequence + +import cobra +from cobra import Metabolite, Reaction +from cobra.core.gene import GPR + +from raven_python.utils.parse import parse_name_comp + +# Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it). +_REVERSIBLE_ARROWS = ("<=>",) +_FORWARD_ARROWS = ("-->", "->", "=>") + + +def _split_equation(equation: str) -> tuple[str, str, bool]: + """Split an equation into (lhs, rhs, reversible) on its arrow.""" + for arrow in _REVERSIBLE_ARROWS: + if arrow in equation: + lhs, rhs = equation.split(arrow, 1) + return lhs, rhs, True + for arrow in _FORWARD_ARROWS: + if arrow in equation: + lhs, rhs = equation.split(arrow, 1) + return lhs, rhs, False + raise ValueError(f"No reaction arrow (<=>, -->, =>) found in equation: {equation!r}") + + +def _parse_side(side: str) -> list[tuple[float, str, str | None]]: + """Parse one side of an equation into ``[(coefficient, token, fallback), ...]``. + + The ``fallback`` slot is for the ambiguous ``" "`` shape: when + matching by name, ``"2 oxoglutarate"`` could be either ``coeff=2, name="oxoglutarate"`` + or ``coeff=1, name="2 oxoglutarate"`` (a real chemistry name). We return the + coefficient-split form as the primary and the full term as the fallback; the + resolver picks whichever matches an existing metabolite. Pure-number heads + with no name (``"2"``) and pure-name terms (``"glucose"``) have no fallback. + """ + terms: list[tuple[float, str, str | None]] = [] + for raw in side.split(" + "): + term = raw.strip() + if not term: + continue + head, _, tail = term.partition(" ") + try: + coeff = float(head) + token = tail.strip() + except ValueError: + coeff, token = 1.0, term + fallback = None + else: + # Coefficient-split succeeded. Keep the full term as a fallback when + # the tail is non-empty so name-resolution can re-try it as one token. + fallback = term if token else None + if not token: + raise ValueError(f"Missing metabolite after coefficient in term: {raw!r}") + terms.append((coeff, token, fallback)) + return terms + + +def _new_met_id(model: cobra.Model, prefix: str) -> str: + """Next free ```` metabolite ID (RAVEN m1, m2, ... scheme).""" + pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$") + used = [int(m.group(1)) for met in model.metabolites if (m := pattern.match(met.id))] + n = max(used) + 1 if used else 1 + while f"{prefix}{n}" in model.metabolites: + n += 1 + return f"{prefix}{n}" + + +def _try_existing( + model: cobra.Model, token: str, *, mets_by: str, compartment: str | None +) -> Metabolite | None: + """Look up ``token`` as an existing metabolite (no creation, no side effects). + + Returns the matching metabolite or ``None``. Used by ``_stoichiometry`` to + disambiguate the ``" "`` shape: if a metabolite whose *name* + (or id) literally contains a leading number exists, prefer it over splitting + the number off as a coefficient. + """ + name, comp = parse_name_comp(token) + if mets_by == "id" and comp is None: + return model.metabolites.get_by_id(token) if token in model.metabolites else None + target_comp = comp if comp is not None else compartment + if target_comp is None: + return None + for met in model.metabolites: + if met.name == name and met.compartment == target_comp: + return met + return None + + +def _resolve_metabolite( + model: cobra.Model, + token: str, + *, + mets_by: str, + compartment: str | None, + allow_new_mets: bool, + new_met_prefix: str, +) -> Metabolite: + """Resolve an equation token to an existing or newly created Metabolite.""" + name, comp = parse_name_comp(token) + + if mets_by == "id" and comp is None: + # token is a metabolite ID + if token in model.metabolites: + return model.metabolites.get_by_id(token) + if not allow_new_mets: + raise ValueError( + f"Unknown metabolite ID {token!r}; pass allow_new_mets=True to create it." + ) + if compartment is None: + raise ValueError( + f"Cannot create metabolite {token!r}: no compartment given." + ) + _warn_unknown_compartment(model, compartment, token) + met = Metabolite(token, compartment=compartment) + model.add_metabolites([met]) + return met + + # name-based (mets_by="name") or explicit name[comp] + target_comp = comp if comp is not None else compartment + if target_comp is None: + raise ValueError( + f"Metabolite {token!r} matched by name needs a compartment; " + "pass compartment=... or use the name[comp] syntax." + ) + if comp is not None and target_comp not in model.compartments and not allow_new_mets: + raise ValueError(f"Compartment {target_comp!r} is not in the model.") + + matches = [ + met + for met in model.metabolites + if met.name == name and met.compartment == target_comp + ] + if matches: + return matches[0] + if not allow_new_mets: + raise ValueError( + f"No metabolite named {name!r} in compartment {target_comp!r}; " + "pass allow_new_mets=True to create it." + ) + _warn_unknown_compartment(model, target_comp, name) + met = Metabolite(_new_met_id(model, new_met_prefix), name=name, compartment=target_comp) + model.add_metabolites([met]) + return met + + +def _warn_unknown_compartment(model: cobra.Model, compartment: str, identifier: str) -> None: + """Warn when a new metabolite would be born into a not-yet-registered compartment. + + Both ``mets_by`` paths previously created the metabolite without validating + the compartment, so a typo (``"cyto"`` for ``"c"``) silently produced a + one-metabolite ghost compartment. cobra inherits the compartment from the + first metabolite assigned to it, so the fix is a warning, not a hard error. + """ + known = set(model.compartments) | set(model._compartments) + if compartment not in known: + warnings.warn( + f"Creating metabolite {identifier!r} in unregistered compartment " + f"{compartment!r} (existing: {sorted(known) or 'none'}); " + "add the compartment first or check for a typo.", + stacklevel=5, + ) + + +def _stoichiometry( + model: cobra.Model, + equation: str, + *, + mets_by: str, + compartment: str | None, + allow_new_mets: bool, + new_met_prefix: str, +) -> tuple[dict[Metabolite, float], bool]: + """Parse an equation into a {Metabolite: net coefficient} dict + reversibility.""" + lhs, rhs, reversible = _split_equation(equation) + coeffs: OrderedDict[Metabolite, float] = OrderedDict() + had_terms = False + for sign, side in ((-1.0, lhs), (1.0, rhs)): + for coeff, token, fallback in _parse_side(side): + had_terms = True + # " " is ambiguous when the name itself starts with a + # number (e.g. "2 oxoglutarate"). Prefer the full-term interpretation + # when it matches an existing metabolite — otherwise fall through to + # the coefficient-split form. + met = None + if fallback is not None: + met = _try_existing( + model, fallback, mets_by=mets_by, compartment=compartment + ) + if met is not None: + coeff = 1.0 + if met is None: + met = _resolve_metabolite( + model, + token, + mets_by=mets_by, + compartment=compartment, + allow_new_mets=allow_new_mets, + new_met_prefix=new_met_prefix, + ) + coeffs[met] = coeffs.get(met, 0.0) + sign * coeff + # Drop metabolites that net to zero (present as both substrate and product). + coeffs = OrderedDict((met, c) for met, c in coeffs.items() if c != 0.0) + if had_terms and not coeffs: + warnings.warn( + f"Equation {equation!r} has no net metabolites (all terms cancelled); " + "the reaction will be added with empty stoichiometry.", + stacklevel=4, + ) + return dict(coeffs), reversible + + +def add_reactions_from_equations( + model: cobra.Model, + reactions: Sequence[Mapping], + *, + mets_by: str = "id", + compartment: str | None = None, + allow_new_mets: bool = True, + allow_new_genes: bool = True, + new_met_prefix: str = "m", +) -> list[Reaction]: + """Add reactions defined by equation strings, matching mets by ID or name. + Parameters + ---------- + model + Target ``cobra.Model``, mutated in place. + reactions + Sequence of mappings, one per reaction. Recognised keys: + + * ``id`` (**required**) — reaction ID; must not already exist. + * ``equation`` (**required**) — e.g. ``"atp_c + h2o_c <=> adp_c + pi_c"``. + Use ``<=>`` for reversible, ``-->``/``->``/``=>`` for irreversible. + * ``name`` — reaction name. + * ``bounds`` — ``(lower, upper)`` tuple; overrides the arrow. + * ``gene_reaction_rule`` — GPR string. + * ``subsystem`` — subsystem name. + mets_by + How bare equation tokens (without ``[comp]``) are matched: + ``"id"`` (RAVEN eqnType 1) or ``"name"`` (eqnType 2). A ``name[comp]`` + token (eqnType 3) is always matched by name + compartment. + compartment + Default compartment for new metabolites and for name-matched tokens + without an explicit ``[comp]``. + allow_new_mets + If True (default), create metabolites not found. New metabolites get + ``compartment`` (id mode) or an auto ID ``m1``, ``m2``, ... (name mode). + If False, an unknown metabolite raises. + allow_new_genes + If True (default), genes in a GPR are auto-created by cobra. If False, + a GPR referencing a gene not already in the model raises. + new_met_prefix + Prefix for auto-generated metabolite IDs in name mode (default ``"m"``). + + Returns + ------- + list of cobra.Reaction + The reactions added, in input order. + """ + if mets_by not in ("id", "name"): + raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}") + + known_genes = {gene.id for gene in model.genes} + added: list[Reaction] = [] + + for spec in reactions: + if "id" not in spec: + raise ValueError(f"Reaction spec missing required 'id': {spec!r}") + rxn_id = spec["id"] + if rxn_id in model.reactions: + raise ValueError( + f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first." + ) + if "equation" not in spec: + raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.") + + coeffs, reversible = _stoichiometry( + model, + spec["equation"], + mets_by=mets_by, + compartment=compartment, + allow_new_mets=allow_new_mets, + new_met_prefix=new_met_prefix, + ) + + rxn = Reaction(rxn_id, name=spec.get("name", "")) + if "bounds" in spec: + rxn.bounds = tuple(spec["bounds"]) + else: + config = cobra.Configuration() + lower = config.lower_bound if reversible else 0.0 + rxn.bounds = (lower, config.upper_bound) + if "subsystem" in spec: + rxn.subsystem = spec["subsystem"] + + model.add_reactions([rxn]) + rxn.add_metabolites(coeffs) + + rule = spec.get("gene_reaction_rule", "") + if rule: + if not allow_new_genes: + missing = sorted(set(GPR.from_string(rule).genes) - known_genes) + if missing: + raise ValueError( + f"Reaction {rxn_id!r} references genes not in the model: " + f"{missing}. Set allow_new_genes=True or add them first." + ) + rxn.gene_reaction_rule = rule + known_genes.update(gene.id for gene in rxn.genes) + + added.append(rxn) + + return added diff --git a/src/raven_python/manipulation/change.py b/src/raven_python/manipulation/change.py new file mode 100644 index 0000000..78612ba --- /dev/null +++ b/src/raven_python/manipulation/change.py @@ -0,0 +1,125 @@ +"""Change the stoichiometry of existing reactions from equation strings. + +Editing the same ``Reaction`` object changes only its stoichiometry — its id, name, +bounds, GPR, subsystem, and position are preserved automatically by cobra. + +So this port simply re-parses the equation (reusing the same metabolite +matching as :func:`~raven_python.manipulation.add.add_reactions_from_equations`, +including name and ``name[comp]`` modes that cobra lacks) and swaps the +metabolites in place. + +Like RAVEN, **bounds are left unchanged** even if the new equation's arrow +implies a different reversibility — use a bounds setter for that. +""" +from __future__ import annotations + +from collections.abc import Mapping + +import cobra +from cobra import Reaction + +from raven_python.manipulation.add import _stoichiometry + +__all__ = ["change_reaction_equations", "change_gene_reaction_rules"] + + +def change_reaction_equations( + model: cobra.Model, + equations: Mapping[str, str], + *, + mets_by: str = "id", + compartment: str | None = None, + allow_new_mets: bool = True, + new_met_prefix: str = "m", +) -> list[Reaction]: + """Replace the stoichiometry of existing reactions. + Parameters + ---------- + model + Target ``cobra.Model``, mutated in place. + equations + Mapping of ``reaction_id -> equation string``. Every ID must already + exist in the model. Equation syntax is identical to + :func:`~raven_python.manipulation.add.add_reactions_from_equations`. + mets_by, compartment, allow_new_mets, new_met_prefix + Metabolite-matching options, as in ``add_reactions_from_equations``. + + Returns + ------- + list of cobra.Reaction + The reactions changed, in input order. + + Notes + ----- + Bounds are **not** modified, matching RAVEN. Changing an equation from + ``-->`` to ``<=>`` does not by itself make the reaction reversible; adjust + the bounds separately. + """ + if mets_by not in ("id", "name"): + raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}") + + changed: list[Reaction] = [] + for rxn_id, equation in equations.items(): + if rxn_id not in model.reactions: + raise ValueError(f"Reaction {rxn_id!r} not found in the model.") + rxn = model.reactions.get_by_id(rxn_id) + + coeffs, _reversible = _stoichiometry( + model, + equation, + mets_by=mets_by, + compartment=compartment, + allow_new_mets=allow_new_mets, + new_met_prefix=new_met_prefix, + ) + + rxn.subtract_metabolites(dict(rxn.metabolites), combine=True) + rxn.add_metabolites(coeffs) + changed.append(rxn) + + return changed + + +def change_gene_reaction_rules( + model: cobra.Model, + rules: Mapping[str, str], + *, + replace: bool = True, +) -> list[Reaction]: + """Set or append gene-reaction rules on existing reactions. + cobra already does the heavy lifting on assignment to + ``reaction.gene_reaction_rule``: it auto-creates any new ``Gene`` objects and + normalises the rule. So the value here is batching plus RAVEN's ``replace`` + option to **append** rather than overwrite. + + Parameters + ---------- + model + Target ``cobra.Model``, mutated in place. + rules + Mapping of ``reaction_id -> GPR string``. Every ID must already exist. + replace + If True (default), overwrite the existing GPR. If False, append the new + rule as an isozyme: ``(old) or (new)`` (just ``new`` if the reaction had + no GPR). + + Returns + ------- + list of cobra.Reaction + The reactions changed, in input order. + """ + changed: list[Reaction] = [] + for rxn_id, rule in rules.items(): + if rxn_id not in model.reactions: + raise ValueError(f"Reaction {rxn_id!r} not found in the model.") + rxn = model.reactions.get_by_id(rxn_id) + + if replace or not rxn.gene_reaction_rule: + new_rule = rule + else: + new_rule = f"({rxn.gene_reaction_rule}) or ({rule})" + + rxn.gene_reaction_rule = new_rule # cobra creates genes + normalises + changed.append(rxn) + + return changed diff --git a/src/raven_python/manipulation/compartments.py b/src/raven_python/manipulation/compartments.py new file mode 100644 index 0000000..091d196 --- /dev/null +++ b/src/raven_python/manipulation/compartments.py @@ -0,0 +1,196 @@ +"""Compartment manipulation — merge all compartments into one, or copy reactions to a +new compartment (ports of RAVEN's ``mergeCompartments`` and ``copyToComps``). + +Both functions are useful **independently of** :func:`raven_python.localization.predict_localization`: +``merge_compartments`` flattens a multi-compartment model for a simplified analysis +(e.g. checking whether the network can in principle make a metabolite, with no +compartment topology in the way); ``copy_to_compartment`` is a building block for +constructing dual-localised pathways. cobra has no equivalents. +""" +from __future__ import annotations + +from collections.abc import Iterable + +import cobra + +# Compartments produced by merge_compartments (RAVEN uses 's' for "system"). +_MERGED_COMPARTMENT = "s" + + +def merge_compartments( + model: cobra.Model, + *, + merged_id: str = _MERGED_COMPARTMENT, + merged_name: str = "system", + drop_single_metabolite_reactions: bool = True, + deduplicate_reactions: bool = True, +) -> tuple[cobra.Model, list[str], list[str]]: + """Merge every metabolite of ``model`` into one ``merged_id`` compartment. + + Returns ``(model_copy, deleted_single_met_reactions, deduplicated_reactions)``. The + returned model is a deep copy of the input. Use cases: + + * Check whether the network can produce/consume a metabolite at all (compartment + topology is often what makes a model look blocked). + * Simplify a model for visualisation or an analysis that doesn't care about + compartments. + * As a pre-step for localisation when the user does want RAVEN's + "start from scratch" workflow (call :func:`merge_compartments` then + :func:`raven_python.localization.predict_localization` with the full reaction list). + + Metabolites that already share a base id (e.g. ``glc__D_c`` and ``glc__D_e`` both + map to ``glc__D``) collapse into one entity in the merged compartment; their + stoichiometric contributions are summed per reaction. Reactions that end up with + only one metabolite (e.g. ``A[c] → A[m]`` becomes ``A → A`` = nothing) are deleted + by default (RAVEN's ``deleteRxnsWithOneMet``). Reactions that become identical + after merging are deduplicated (one survives). + """ + out = model.copy() + + # 1. For each metabolite, derive a base id (strip the trailing _). + # Two mets in different compartments sharing the base id collapse to one. + new_to_old: dict[str, list[cobra.Metabolite]] = {} + for m in list(out.metabolites): + base = _base_id(m) + new_to_old.setdefault(base, []).append(m) + + # 2. Build the merged metabolites and rewrite reactions. + canonical: dict[str, cobra.Metabolite] = {} + for base, mets in new_to_old.items(): + proto = mets[0] + new_met = cobra.Metabolite(base, name=proto.name, compartment=merged_id, + formula=proto.formula, charge=proto.charge) + new_met.notes = dict(proto.notes or {}) + canonical[base] = new_met + + # Rewrite all reactions: replace each metabolite with its canonical, summing + # coefficients where multiple original mets collapse to one. + rewritten: dict[str, dict[str, float]] = {} + for r in list(out.reactions): + new_stoich: dict[cobra.Metabolite, float] = {} + for m, coeff in list(r.metabolites.items()): + canon = canonical[_base_id(m)] + new_stoich[canon] = new_stoich.get(canon, 0.0) + coeff + # Drop zero net coefficients (substrate + product of the same base met cancel). + new_stoich = {m: c for m, c in new_stoich.items() if c != 0.0} + rewritten[r.id] = {m.id: c for m, c in new_stoich.items()} + + # Now build a fresh model with the canonical mets + rewritten reactions; the + # cobra in-place rewrite would require careful constraint surgery, so a clean + # rebuild is simpler and less error-prone. + merged = cobra.Model(out.id or "merged") + merged.compartments = {merged_id: merged_name} + merged.add_metabolites(list(canonical.values())) + deleted_single: list[str] = [] + deduplicated: list[str] = [] + seen_signatures: dict[tuple, str] = {} + keep_reactions: list[cobra.Reaction] = [] + for r in out.reactions: + stoich = rewritten[r.id] + if drop_single_metabolite_reactions and len(stoich) <= 1: + deleted_single.append(r.id) + continue + if not stoich: # everything cancelled + deleted_single.append(r.id) + continue + sig = (frozenset(stoich.items()), bool(r.lower_bound < 0), bool(r.upper_bound > 0)) + if deduplicate_reactions and sig in seen_signatures: + deduplicated.append(r.id) + continue + seen_signatures[sig] = r.id + new_r = cobra.Reaction(r.id, name=r.name, lower_bound=r.lower_bound, + upper_bound=r.upper_bound) + new_r.add_metabolites({merged.metabolites.get_by_id(mid): c for mid, c in stoich.items()}) + new_r.gene_reaction_rule = r.gene_reaction_rule + if r.subsystem: + new_r.subsystem = r.subsystem + new_r.notes = dict(r.notes or {}) + keep_reactions.append(new_r) + merged.add_reactions(keep_reactions) + return merged, deleted_single, deduplicated + + +def copy_to_compartment( + model: cobra.Model, + reactions: Iterable[str], + target_compartment: str, + *, + target_compartment_name: str | None = None, + delete_original: bool = False, + id_suffix: str | None = None, +) -> tuple[cobra.Model, list[str], list[str]]: + """Copy a set of reactions into ``target_compartment``. RAVEN's ``copyToComps``. + + Returns ``(model_copy, new_reaction_ids, new_metabolite_ids)``. Use cases: + + * Build a dual-localised pathway (e.g. duplicate glycolysis into a peroxisome). + * Mirror a curated subsystem into an additional compartment as a draft to refine. + * Set up the input for a flux comparison between alternate compartmentalisations. + + Each copied reaction is given the id ``"_"`` (default + ``id_suffix=target_compartment``); each metabolite it touches is mapped to (or + created in) ``target_compartment`` with the same suffix convention. ``delete_original=True`` + moves the reactions instead of copying. + """ + out = model.copy() + suffix = id_suffix if id_suffix is not None else target_compartment + if target_compartment not in out.compartments: + out.compartments = {**out.compartments, + target_compartment: target_compartment_name or target_compartment} + + preexisting_met_ids = {x.id for x in out.metabolites} + new_rxn_ids: list[str] = [] + for rid in list(reactions): + if rid not in out.reactions: + raise ValueError(f"reaction {rid!r} not in model") + src = out.reactions.get_by_id(rid) + new_id = f"{rid}_{suffix}" + if new_id in out.reactions: + continue # already copied; idempotent + new_stoich: dict[cobra.Metabolite, float] = {} + for m, coeff in src.metabolites.items(): + target_met = _met_in_compartment(out, m, target_compartment, suffix=suffix) + new_stoich[target_met] = coeff + new_r = cobra.Reaction(new_id, name=src.name, + lower_bound=src.lower_bound, upper_bound=src.upper_bound) + new_r.add_metabolites(new_stoich) + new_r.gene_reaction_rule = src.gene_reaction_rule + if src.subsystem: + new_r.subsystem = src.subsystem + new_r.notes = dict(src.notes or {}) + out.add_reactions([new_r]) + new_rxn_ids.append(new_id) + if delete_original: + out.remove_reactions([src.id], remove_orphans=False) + + new_met_ids = [m.id for m in out.metabolites if m.id not in preexisting_met_ids] + return out, new_rxn_ids, new_met_ids + + +# ----------------------------------------------------------------- helpers + +def _base_id(m: cobra.Metabolite) -> str: + """Strip the trailing ``_`` suffix from a metabolite id (if present).""" + if m.compartment and m.id.endswith(f"_{m.compartment}"): + return m.id[: -(len(m.compartment) + 1)] + return m.id + + +def _met_in_compartment(model: cobra.Model, source: cobra.Metabolite, + compartment: str, *, suffix: str | None = None) -> cobra.Metabolite: + """Return (creating if needed) the copy of ``source`` in ``compartment``. + + The new metabolite id is ``"_"`` (default ``suffix=compartment``). + Already-existing copies are reused. + """ + if source.compartment == compartment: + return source + base = _base_id(source) + new_id = f"{base}_{suffix if suffix is not None else compartment}" + if new_id in model.metabolites: + return model.metabolites.get_by_id(new_id) + new_met = cobra.Metabolite(new_id, name=source.name, compartment=compartment, + formula=source.formula, charge=source.charge) + new_met.notes = dict(source.notes or {}) + model.add_metabolites([new_met]) + return new_met diff --git a/src/raven_python/manipulation/expand.py b/src/raven_python/manipulation/expand.py new file mode 100644 index 0000000..246f3b9 --- /dev/null +++ b/src/raven_python/manipulation/expand.py @@ -0,0 +1,124 @@ +"""Expand reactions with isozymes into one reaction per isozyme. + +Operates on cobra's GPR AST, so the model stays a plain ``cobra.Model`` throughout. + +Provenance: this implementation was first written for geckopy +(``geckopy/ec_model/pipeline/expand.py``, where it backed makeEcModel stage 5) +and is adopted here as its canonical home; geckopy will import it from raven_python +once raven_python is published. + +MATLAB-COMPAT: GECKO MATLAB and RAVEN ``expandModel.m`` use string manipulation +on grRules to detect and split isozymes. raven_python uses cobrapy's GPR AST +instead. Output should be equivalent for any well-formed GPR; cases that differ +are likely malformed GPR strings that the AST flags as invalid. +""" +from __future__ import annotations + +import ast +import copy + +import cobra +from cobra.core.gene import GPR + + +def _gpr_to_dnf(gpr: GPR) -> list[list[str]]: + """Convert a GPR to disjunctive normal form (list of AND-clauses). + + An empty GPR yields an empty list. A single clause (no OR anywhere) + yields a list of length 1. OR-of-ANDs yields one sublist per + disjunct, each containing the gene names ANDed together. + + Handles distributivity: ``g1 and (g2 or g3)`` becomes + ``[[g1, g2], [g1, g3]]``. + """ + if gpr is None or gpr.body is None: + return [] + return _node_to_dnf(gpr.body) + + +def _node_to_dnf(node) -> list[list[str]]: + """Recursive helper. Returns DNF as list of AND-clauses.""" + if isinstance(node, ast.Name): + return [[node.id]] + if isinstance(node, ast.BoolOp): + if isinstance(node.op, ast.Or): + result: list[list[str]] = [] + for child in node.values: + result.extend(_node_to_dnf(child)) + return result + if isinstance(node.op, ast.And): + clauses: list[list[str]] = [[]] + for child in node.values: + child_dnf = _node_to_dnf(child) + new_clauses: list[list[str]] = [] + for existing in clauses: + for extra in child_dnf: + new_clauses.append(existing + extra) + clauses = new_clauses + return clauses + raise ValueError(f"Unexpected GPR node type: {type(node).__name__}") + + +def expand_model(model: cobra.Model) -> list[str]: + """Split reactions with isozymes (OR in GPR) into one reaction per isozyme. + For each reaction whose GPR contains at least one OR, the reaction + is removed and replaced by one copy per disjunctive clause. The new + reactions get ID suffix ``_EXP_1``, ``_EXP_2``, etc. All other + fields (stoichiometry, bounds, name, subsystem) are copied verbatim; + only the GPR is simplified to the single AND-clause for that + isozyme. + + Reactions with no GPR, or with a GPR that has no OR, are left + untouched. + + Parameters + ---------- + model + A cobra.Model, mutated in place. + + Returns + ------- + list of str + Sorted IDs of newly added expanded reactions (those with + ``_EXP_N`` suffixes). The original reactions that were split + are no longer in the model. + """ + expansions: list[tuple[cobra.Reaction, list[list[str]]]] = [] + + for rxn in model.reactions: + if not rxn.gene_reaction_rule: + continue + clauses = _gpr_to_dnf(rxn.gpr) + if len(clauses) <= 1: + continue + expansions.append((rxn, clauses)) + + added_ids: list[str] = [] + for original_rxn, clauses in expansions: + new_rxns: list[cobra.Reaction] = [] + for i, clause in enumerate(clauses, start=1): + new_rxn = cobra.Reaction( + id=f"{original_rxn.id}_EXP_{i}", + name=original_rxn.name, + ) + new_rxn.lower_bound = original_rxn.lower_bound + new_rxn.upper_bound = original_rxn.upper_bound + new_rxn.add_metabolites(dict(original_rxn.metabolites.items())) + new_rxn.subsystem = original_rxn.subsystem + new_rxn.gene_reaction_rule = " and ".join(clause) + # Propagate per-reaction metadata (notably ec-code / annotations) + # so downstream functions see the same annotations on expanded + # reactions as on the original. Deep-copy so siblings are independent. + new_rxn.annotation = copy.deepcopy(original_rxn.annotation) + new_rxn.notes = copy.deepcopy(original_rxn.notes) + new_rxns.append(new_rxn) + + obj_coeff = original_rxn.objective_coefficient + model.remove_reactions([original_rxn]) + model.add_reactions(new_rxns) + if obj_coeff: # keep the original in the objective — sum over its isozyme copies + for new_rxn in new_rxns: + new_rxn.objective_coefficient = obj_coeff + added_ids.extend(r.id for r in new_rxns) + + return sorted(added_ids) diff --git a/src/raven_python/manipulation/irreversible.py b/src/raven_python/manipulation/irreversible.py new file mode 100644 index 0000000..3f64a68 --- /dev/null +++ b/src/raven_python/manipulation/irreversible.py @@ -0,0 +1,72 @@ +"""Convert reversible reactions to an irreversible (forward + reverse) form. + +cobrapy's own ``convert_to_irreversible`` was removed, so this is a genuine +implementation rather than a wrapper. + +Provenance: first written for geckopy +(``geckopy/ec_model/pipeline/preprocess.py``, makeEcModel stage 4, tagged +"RAVENpy candidate") and adopted here as its canonical home; geckopy will +import it from raven_python once raven_python is published. +""" +from __future__ import annotations + +import cobra + + +def convert_to_irreversible(model: cobra.Model) -> list[str]: + """Split non-exchange reversible reactions into a forward + reverse pair. + For each non-exchange reaction with ``lb < 0``: + + - The original reaction is kept as the forward direction. Its + lower bound is clamped to 0. + - A new reaction with the same ID plus a ``_REV`` suffix is added, + representing the reverse direction. Its stoichiometry is the + negation of the original, its bounds are ``(0, -original_lb)``, + and it inherits the name (with " (reversible)" appended) and the + gene-protein rule of the original. + + Exchange reactions (boundary reactions) are never split, regardless + of their bounds, matching MATLAB behavior where exchange reactions + are explicitly excluded from ``convertToIrrev``. + + Parameters + ---------- + model + A cobra.Model, mutated in place. + + Returns + ------- + list of str + Sorted IDs of newly added reverse reactions (the ones ending in + ``_REV``). The forward reactions retain their original IDs. + """ + reverse_rxns_to_add: list[cobra.Reaction] = [] + forward_updates: list[cobra.Reaction] = [] + + for rxn in model.reactions: + if rxn.boundary: + continue + if rxn.lower_bound >= 0: + continue + + original_lb = rxn.lower_bound + + rev_rxn = cobra.Reaction( + id=f"{rxn.id}_REV", + name=(f"{rxn.name} (reversible)" if rxn.name else f"{rxn.id}_REV"), + ) + rev_rxn.lower_bound = 0.0 + rev_rxn.upper_bound = -original_lb + rev_rxn.add_metabolites({m: -c for m, c in rxn.metabolites.items()}) + rev_rxn.gene_reaction_rule = rxn.gene_reaction_rule + + reverse_rxns_to_add.append(rev_rxn) + forward_updates.append(rxn) + + for rxn in forward_updates: + rxn.lower_bound = 0.0 + + if reverse_rxns_to_add: + model.add_reactions(reverse_rxns_to_add) + + return sorted(r.id for r in reverse_rxns_to_add) diff --git a/src/raven_python/manipulation/merge.py b/src/raven_python/manipulation/merge.py new file mode 100644 index 0000000..bfa1f24 --- /dev/null +++ b/src/raven_python/manipulation/merge.py @@ -0,0 +1,146 @@ +"""Merge several models into one. + +cobra's ``Model.merge`` is pairwise and matches everything strictly by id; this +merges **N** models and unifies metabolites by **name[compartment]** (so the same +compound under different ids in two models becomes one), while adding **all** +reactions without de-duplication +(a reaction whose ID already exists is renamed ``id_``). Genes are +unified by ID. Provenance (which source model each object came from) is recorded +in ``notes['origin']``. + +The bulk of RAVEN's function is struct field-padding and manual S-matrix +assembly, none of which is needed on ``cobra.Model``. +""" +from __future__ import annotations + +import copy +import warnings +from collections.abc import Iterable + +import cobra +from cobra import Metabolite, Model, Reaction + + +def _unique_id(existing, base: str, suffix: str) -> str: + """Return base, or base_suffix (then base_suffix_2, ...) if it collides.""" + if base not in existing: + return base + candidate = f"{base}_{suffix}" + n = 2 + while candidate in existing: + candidate = f"{base}_{suffix}_{n}" + n += 1 + return candidate + + +def merge_models( + models: Iterable[cobra.Model], + *, + match_by: str = "name", + track_origin: bool = True, +) -> cobra.Model: + """Merge models into a single new model. + Parameters + ---------- + models + The models to merge (two or more). A single model is returned as a copy. + match_by + How metabolites are unified across models: ``"name"`` (default) treats + metabolites with the same *name and compartment* as identical (IDs + ignored); ``"id"`` matches by metabolite ID. + track_origin + If True (default), record the source model's ``id`` in each reaction's, + metabolite's, and gene's ``notes['origin']``. + + Returns + ------- + cobra.Model + A new merged model (``id="MERGED"``). Reactions are **not** de-duplicated + — matching RAVEN, every reaction from every model is kept, with ID + collisions renamed ``id_``. + """ + models = list(models) + if not models: + raise ValueError("merge_models requires at least one model.") + if match_by not in ("name", "id"): + raise ValueError(f"match_by must be 'name' or 'id', got {match_by!r}") + if len(models) == 1: + return models[0].copy() + + merged = Model("MERGED") + comp_names: dict[str, str] = {} + met_lookup: dict = {} # name/comp or id key -> merged Metabolite + + def met_key(met: Metabolite): + return (met.name, met.compartment) if match_by == "name" else met.id + + def ensure_metabolite(src: Metabolite, origin: str) -> Metabolite: + key = met_key(src) + if key in met_lookup: + existing = met_lookup[key] + # Two source models can map to the same name[comp] (or id) with + # different formula/charge; silently picking the first-seen has + # quietly corrupted mass balance in the past. Warn so the caller + # sees the conflict. + if src.formula and existing.formula and src.formula != existing.formula: + warnings.warn( + f"merge_models: metabolite {existing.id!r} (from earlier model) " + f"and {src.id!r} (from {origin!r}) share key {key!r} but " + f"have different formulas ({existing.formula!r} vs {src.formula!r}); " + "keeping the first.", + stacklevel=3, + ) + if ( + existing.charge is not None + and src.charge is not None + and existing.charge != src.charge + ): + warnings.warn( + f"merge_models: metabolite {existing.id!r} (from earlier model) " + f"and {src.id!r} (from {origin!r}) share key {key!r} but " + f"have different charges ({existing.charge} vs {src.charge}); " + "keeping the first.", + stacklevel=3, + ) + return existing + new_id = _unique_id(merged.metabolites, src.id, origin) + new_met = Metabolite( + new_id, name=src.name, compartment=src.compartment, + formula=src.formula, charge=src.charge, + ) + new_met.annotation = copy.deepcopy(src.annotation) + new_met.notes = copy.deepcopy(src.notes) + if track_origin: + new_met.notes.setdefault("origin", origin) + merged.add_metabolites([new_met]) + met_lookup[key] = new_met + return new_met + + for model in models: + origin = model.id or "model" + comp_names.update(model.compartments) + genes_before = {g.id for g in merged.genes} + + for rxn in model.reactions: + new_id = _unique_id(merged.reactions, rxn.id, origin) + new_rxn = Reaction(new_id, name=rxn.name) + new_rxn.bounds = rxn.bounds + new_rxn.subsystem = rxn.subsystem + merged.add_reactions([new_rxn]) + new_rxn.add_metabolites( + {ensure_metabolite(m, origin): coef for m, coef in rxn.metabolites.items()} + ) + if rxn.gene_reaction_rule: + new_rxn.gene_reaction_rule = rxn.gene_reaction_rule + new_rxn.annotation = copy.deepcopy(rxn.annotation) + new_rxn.notes = copy.deepcopy(rxn.notes) + if track_origin: + new_rxn.notes.setdefault("origin", origin) + + if track_origin: + for gene in merged.genes: + if gene.id not in genes_before: + gene.notes.setdefault("origin", origin) + + merged._compartments.update(comp_names) + return merged diff --git a/src/raven_python/manipulation/parameters.py b/src/raven_python/manipulation/parameters.py new file mode 100644 index 0000000..f349804 --- /dev/null +++ b/src/raven_python/manipulation/parameters.py @@ -0,0 +1,78 @@ +"""Set reaction bounds to a sign-aware ±% variance band around measured values. + +Cobra has no idiom for the *variance band* case (e.g. "5 ± 20 %"); the other common +bound-setting cases are cobra one-liners: + +* fixed lb / ub → ``reaction.lower_bound`` / ``upper_bound`` / ``reaction.bounds`` +* equality → ``reaction.bounds = (v, v)`` +* objective → ``model.objective = {reaction: coeff}`` +* unconstrained → ``reaction.bounds = cobra.Configuration().bounds`` +""" +from __future__ import annotations + +from collections.abc import Iterable, Sequence + +import cobra +from cobra import Reaction + +Number = int | float + + +def _resolve(model: cobra.Model, reactions) -> list[Reaction]: + if isinstance(reactions, (str, Reaction)): + reactions = [reactions] + out: list[Reaction] = [] + for r in reactions: + if isinstance(r, Reaction): + out.append(r) + elif r in model.reactions: + out.append(model.reactions.get_by_id(r)) + else: + raise ValueError(f"Reaction {r!r} not found in the model.") + return out + + +def _broadcast(value, n: int) -> list[float]: + if isinstance(value, (int, float)): + return [float(value)] * n + vals = [float(v) for v in value] + if len(vals) != n: + raise ValueError( + f"Expected 1 or {n} values to match the reactions, got {len(vals)}." + ) + return vals + + +def set_variance_bounds( + model: cobra.Model, + reactions: str | Reaction | Iterable, + values: Number | Sequence[Number], + percent: Number, +) -> list[Reaction]: + """Constrain reactions to a ``±percent/2`` band around measured values. + + For a measured value ``v`` and ``percent`` ``p``, the bounds become + ``v * (1 - p/200) .. v * (1 + p/200)`` — i.e. ``percent`` is the *total* + width, split half above and half below. For a negative ``v`` the two are + swapped so that ``lb <= ub``. E.g. ``percent=5`` gives 97.5 %..102.5 % of ``v``. + + Parameters + ---------- + reactions + Reaction IDs or objects. + values + Measured value per reaction; a scalar is broadcast to all reactions. + percent + Total band width as a percentage. + + Returns + ------- + list of cobra.Reaction + The reactions affected. + """ + rxns = _resolve(model, reactions) + half = percent / 200.0 + for rxn, v in zip(rxns, _broadcast(values, len(rxns)), strict=True): + lo, hi = v * (1 - half), v * (1 + half) + rxn.bounds = (hi, lo) if v < 0 else (lo, hi) + return rxns diff --git a/src/raven_python/manipulation/remove.py b/src/raven_python/manipulation/remove.py new file mode 100644 index 0000000..492de36 --- /dev/null +++ b/src/raven_python/manipulation/remove.py @@ -0,0 +1,120 @@ +"""Remove metabolites or genes from a model. + +For removing *reactions*, use cobra directly: +``cobra.Model.remove_reactions(reactions, remove_orphans=...)``. + +The two functions here delegate the core to cobra and add the cobra-absent behaviour: + +* ``remove_metabolites`` — cobra matches metabolites by ID; RAVEN's ``isNames`` + deletes a metabolite in **every compartment at once** by name. That name + resolution is the *sole* reason this wrapper exists (see the note on it). +* ``remove_genes`` — cobra's ``cobra.manipulation.remove_genes`` already rewrites + GPRs through the boolean AST (removing one gene of ``A and B`` empties the + rule, of ``A or B`` keeps the other) — exactly RAVEN's intent, without its + ``eval``. The gap is RAVEN's default of **constraining** flux-blocked reactions + to zero instead of deleting them; exposed as ``blocked_reactions``. +""" +from __future__ import annotations + +from collections.abc import Iterable + +import cobra +from cobra import Gene, Metabolite +from cobra.manipulation import remove_genes as _cobra_remove_genes + + +def _as_list(obj) -> list: + if isinstance(obj, (str, Metabolite, Gene)): + return [obj] + return list(obj) + + +def remove_metabolites( + model: cobra.Model, + metabolites: str | Metabolite | Iterable, + *, + by_name: bool = False, + destructive: bool = False, +) -> None: + """Remove metabolites, optionally matching by name across all compartments. + + Parameters + ---------- + by_name + If True, ``metabolites`` are metabolite *names*; every metabolite with a + matching name is removed, regardless of compartment (RAVEN ``isNames``). + If False, they are IDs/objects, resolved via cobra. + destructive + Passed to cobra: if True, also remove every reaction the metabolite + participates in. + + Note + ---- + With ``by_name=False`` this is just ``model.remove_metabolites`` — the + ``by_name`` cross-compartment deletion is the only thing this adds over cobra. + """ + if by_name: + wanted = set(_as_list(metabolites)) + targets = [m for m in model.metabolites if m.name in wanted] + else: + targets = model.metabolites.get_by_any(_as_list(metabolites)) + if targets: + model.remove_metabolites(targets, destructive=destructive) + + +def remove_genes( + model: cobra.Model, + genes: str | Gene | Iterable, + *, + blocked_reactions: str = "remove", + remove_orphans: bool = False, +) -> list[str]: + """Remove genes and handle reactions left unable to carry flux. + + GPR rewriting (with correct AND/OR semantics) and gene deletion are done by cobra; + this adds a policy for reactions whose GPR becomes empty (no enzyme left): + + * ``"remove"`` — delete them (cobra's default). + * ``"constrain"`` — keep them but set bounds to ``(0, 0)``. + * ``"keep"`` — leave them with an empty GPR and unchanged bounds. + + ``remove_orphans`` (only meaningful with ``blocked_reactions="remove"``) + passes through to cobra: drop metabolites *and* genes orphaned by the removal. + + Returns + ------- + list of str + IDs of the reactions that became flux-blocked (had a GPR, now empty). + """ + if blocked_reactions not in ("remove", "constrain", "keep"): + raise ValueError( + f"blocked_reactions must be 'remove', 'constrain', or 'keep', " + f"got {blocked_reactions!r}" + ) + + # Resolve to gene IDs that are actually in the model (RAVEN filters likewise). + requested = [g.id if isinstance(g, Gene) else g for g in _as_list(genes)] + present = [gid for gid in requested if gid in model.genes] + if not present: + return [] + + # Reactions touched by these genes that currently have a GPR. + affected = set() + for gid in present: + affected.update(r.id for r in model.genes.get_by_id(gid).reactions) + had_gpr = {rid for rid in affected if model.reactions.get_by_id(rid).gene_reaction_rule} + + # cobra rewrites GPRs (AST) and removes the gene objects; we manage reactions. + _cobra_remove_genes(model, present, remove_reactions=False) + + blocked = [ + rid for rid in had_gpr if not model.reactions.get_by_id(rid).gene_reaction_rule + ] + + if blocked_reactions == "remove": + model.remove_reactions(blocked, remove_orphans=remove_orphans) + elif blocked_reactions == "constrain": + for rid in blocked: + model.reactions.get_by_id(rid).bounds = (0, 0) + + return sorted(blocked) diff --git a/src/raven_python/manipulation/simplify.py b/src/raven_python/manipulation/simplify.py new file mode 100644 index 0000000..2deaccd --- /dev/null +++ b/src/raven_python/manipulation/simplify.py @@ -0,0 +1,229 @@ +"""Reduce a model by removing/merging reactions that cannot carry flux. + +Four reduction modes that cobra does not cover out of the box: +``remove_dead_end_reactions`` (reactions whose substrates have no producer), +``remove_duplicate_reactions``, ``constrain_reversible_reactions`` (tighten bounds +via FVA), and ``group_linear_reactions`` (lossy fold of unit-stoichiometry chains +into one reaction; drops gene rules). + +Cobra-covered modes that you'd reach for separately: + +* No-flux removal → ``cobra.flux_analysis.find_blocked_reactions``. +* Zero-interval removal → filter reactions with ``bounds == (0, 0)`` then prune. +""" +from __future__ import annotations + +import math +from collections.abc import Iterable + +import cobra +from cobra.flux_analysis import flux_variability_analysis + +from raven_python.manipulation.irreversible import convert_to_irreversible + + +def _prune_orphan_metabolites(model: cobra.Model) -> list[str]: + orphans = [m for m in model.metabolites if not m.reactions] + if orphans: + model.remove_metabolites(orphans) + return [m.id for m in orphans] + + +def _can_produce_and_consume(met) -> tuple[bool, bool]: + """Whether the network can both produce and consume ``met`` (given directions).""" + produce = consume = False + for rxn in met.reactions: + coef = rxn.get_coefficient(met) + if coef > 0: + produce |= rxn.upper_bound > 0 + consume |= rxn.lower_bound < 0 + elif coef < 0: + consume |= rxn.upper_bound > 0 + produce |= rxn.lower_bound < 0 + return produce, consume + + +def remove_dead_end_reactions( + model: cobra.Model, *, reserved: Iterable[str] | None = None +) -> tuple[list[str], list[str]]: + """Iteratively remove dead-end reactions and metabolites. + + A metabolite + is a dead end if it participates in only one reaction, or if (accounting for + reaction directionality) it can only be produced or only consumed — such + metabolites cannot carry steady-state flux, so the reactions touching them + are removed. Repeats until stable. + + Returns ``(removed_reaction_ids, removed_metabolite_ids)``. + """ + reserved = set(reserved or []) + removed_rxns: list[str] = [] + removed_mets: list[str] = [] + while True: + removed_mets += _prune_orphan_metabolites(model) + dead = [ + m + for m in model.metabolites + if len(m.reactions) <= 1 or not all(_can_produce_and_consume(m)) + ] + if not dead: + break + rxns = {r for m in dead for r in m.reactions} + to_delete = [r for r in rxns if r.id not in reserved] + if not to_delete: + break + removed_rxns += [r.id for r in to_delete] + model.remove_reactions(to_delete) + return removed_rxns, removed_mets + + +def _signature(rxn): + mets = frozenset((m.id, c) for m, c in rxn.metabolites.items()) + return (mets, rxn.lower_bound, rxn.upper_bound, rxn.objective_coefficient) + + +def remove_duplicate_reactions( + model: cobra.Model, *, reserved: Iterable[str] | None = None +) -> list[str]: + """Remove all-but-one of each set of duplicate reactions. + + Reactions are duplicates when they have identical stoichiometry, bounds, and + objective coefficient. One of each set is kept (reserved reactions are never + removed). Returns the removed reaction IDs. + """ + reserved = set(reserved or []) + groups: dict = {} + for rxn in model.reactions: + groups.setdefault(_signature(rxn), []).append(rxn) + + removed: list[str] = [] + for rxns in groups.values(): + if len(rxns) <= 1: + continue + keep = rxns[-1] + to_remove = [r for r in rxns if r is not keep and r.id not in reserved] + if to_remove: + removed += [r.id for r in to_remove] + model.remove_reactions(to_remove) + return removed + + +def constrain_reversible_reactions( + model: cobra.Model, *, eps: float = 1e-9 +) -> list[str]: + """Constrain reversible reactions that can only carry flux one way. + + Runs FVA on + each reversible reaction; if it can only carry forward flux its lower bound + is set to 0, and if it can only carry reverse flux it is flipped to a forward + reaction (stoichiometry, bounds, and objective negated). Returns the changed + reaction IDs. + """ + revs = [r for r in model.reactions if r.lower_bound < 0 < r.upper_bound] + if not revs: + return [] + # Infeasible models surface as either OptimizationError (Gurobi/HiGHS) or + # NaN-filled ranges (some optlang backends silently). Catch both and raise + # a single clear error — the original ``abs(NaN) < eps`` comparison would + # have silently no-op'd, letting bogus "all reactions truly reversible" + # decisions sneak through. + try: + fva = flux_variability_analysis( + model, reaction_list=revs, fraction_of_optimum=0.0 + ) + except Exception as exc: # noqa: BLE001 - solver-family agnostic + raise RuntimeError( + "constrain_reversible_reactions: FVA failed — the model is likely " + "infeasible at fraction_of_optimum=0. Fix the infeasibility first " + "(often a missing exchange or an over-constrained essential). " + f"({exc})" + ) from exc + if fva[["minimum", "maximum"]].isna().any().any(): + raise RuntimeError( + "constrain_reversible_reactions: FVA returned NaN ranges — the " + "model is infeasible at fraction_of_optimum=0. Fix the infeasibility " + "first (often a missing exchange or an over-constrained essential)." + ) + + changed: list[str] = [] + for rxn in revs: + lo = fva.at[rxn.id, "minimum"] + hi = fva.at[rxn.id, "maximum"] + # Guard against ±inf ranges (unbounded objective): treat them as truly + # reversible rather than "zero" by the abs(·) < eps check. + if math.isinf(lo) or math.isinf(hi): + continue + min_zero, max_zero = abs(lo) < eps, abs(hi) < eps + if min_zero == max_zero: # both ~0 (blocked) or both nonzero (truly reversible) + continue + if max_zero: # only reverse flux → flip to a forward reaction + old_lb = rxn.lower_bound + rxn.add_metabolites({m: -2 * c for m, c in rxn.metabolites.items()}) + rxn.bounds = (0.0, -old_lb) + rxn.objective_coefficient = -rxn.objective_coefficient + else: # only forward flux + rxn.lower_bound = 0.0 + changed.append(rxn.id) + return changed + + +def group_linear_reactions( + model: cobra.Model, *, reserved: Iterable[str] | None = None +) -> None: + """Merge linear (single-producer, single-consumer) reaction chains. + + **Lossy**: gene-reaction + associations are discarded (RAVEN does the same), since merged reactions have + no meaningful combined GPR. The model is first made irreversible, then any + metabolite that is produced by exactly one reaction and consumed by exactly + one reaction is eliminated by merging the two reactions. Mutates in place. + """ + reserved = set(reserved or []) + + # Lossy: drop all gene information. + for rxn in model.reactions: + rxn.gene_reaction_rule = "" + for gene in list(model.genes): + model.genes.remove(gene) + + convert_to_irreversible(model) + + # Worklist of metabolites to (re)consider for merging. Each metabolite + # participating in a merge can expose new linear chains in its neighbours, + # so we re-enqueue the touched mets rather than restart the whole scan + # (the old O(n²·m) restart-after-every-merge loop). + pending: list = list(model.metabolites) + seen_in_pass: set = set() + while pending: + met = pending.pop() + if met not in model.metabolites: # removed in a previous merge + continue + rxns = list(met.reactions) + if len(rxns) != 2 or any(r.id in reserved for r in rxns): + continue + r1, r2 = rxns + c1, c2 = r1.get_coefficient(met), r2.get_coefficient(met) + if (c1 > 0) == (c2 > 0): # need one producer and one consumer + continue + ratio = abs(c1 / c2) + new_lb = max(r1.lower_bound, r2.lower_bound / ratio) + new_ub = min(r1.upper_bound, r2.upper_bound / ratio) + new_obj = r1.objective_coefficient + r2.objective_coefficient * ratio + # Re-enqueue every metabolite touched by either side — the merge can + # turn neighbours into single-producer/consumer chains in turn. + touched = {m for m in r1.metabolites} | {m for m in r2.metabolites} + # Merge r2*ratio into r1; the shared metabolite cancels and is dropped. + r1.add_metabolites({m: c * ratio for m, c in r2.metabolites.items()}) + model.remove_reactions([r2]) + r1.bounds = (new_lb, new_ub) + r1.objective_coefficient = new_obj + seen_in_pass.clear() + for m in touched: + if m in model.metabolites and id(m) not in seen_in_pass: + seen_in_pass.add(id(m)) + pending.append(m) + # One terminal cleanup pass (cheap; only what remains). + empty = [r for r in model.reactions if not r.metabolites] + if empty: + model.remove_reactions(empty) + _prune_orphan_metabolites(model) diff --git a/src/raven_python/manipulation/transfer.py b/src/raven_python/manipulation/transfer.py new file mode 100644 index 0000000..b867f02 --- /dev/null +++ b/src/raven_python/manipulation/transfer.py @@ -0,0 +1,144 @@ +"""Copy reactions (with their metabolites and genes) from another model. + +cobra's ``Model.merge`` / ``add_reactions`` match metabolites strictly by id. This +transfers a chosen set of reactions from a *source* model into a draft, matching +metabolites by **name[compartment]** instead — so a compound present in both models +under different ids is reused rather than duplicated, and only genuinely new +metabolites are created (copying the source's id, formula, +charge, and annotation). New genes are auto-created by cobra when the GPR is set. +This is the post-``getModelFromHomology`` "copy a few more reactions across" +workflow. +""" +from __future__ import annotations + +import copy +from collections.abc import Iterable + +import cobra +from cobra import Metabolite, Reaction + +from raven_python.manipulation.add import _new_met_id + + +def _name_comp(met: Metabolite) -> str: + return f"{met.name}[{met.compartment}]" + + +def add_reactions_from_model( + model: cobra.Model, + source_model: cobra.Model, + reactions: str | Iterable[str], + *, + genes: bool | str | Iterable[str] = False, + note: str | None = "Added via add_reactions_from_model()", + confidence: int | None = None, +) -> list[Reaction]: + """Copy reactions from ``source_model`` into ``model``. + Parameters + ---------- + model + Draft model to copy into (mutated in place). + source_model + Model to copy reactions from. + reactions + Reaction ID(s) in ``source_model``. Reactions already present in + ``model`` (by ID) are skipped. + genes + ``False`` (default): add reactions without GPRs. ``True``: copy each + reaction's GPR from the source. A string: use it as the GPR for every + added reaction. A list: per-reaction GPRs (matching the reactions that + are actually added). New genes are created automatically. + note + Stored in each added reaction's ``notes['note']`` (set ``None`` to skip). + confidence + If given, stored in each added reaction's ``notes['confidence_score']``. + + Returns + ------- + list of cobra.Reaction + The reactions added, in input order. + """ + rxn_ids = [reactions] if isinstance(reactions, str) else list(reactions) + missing = [r for r in rxn_ids if r not in source_model.reactions] + if missing: + raise ValueError(f"Reactions not found in the source model: {missing}") + + new_ids = [r for r in rxn_ids if r not in model.reactions] + if not new_ids: + raise ValueError("All reactions are already in the model.") + source_rxns = [source_model.reactions.get_by_id(r) for r in new_ids] + + if genes is False: + rules = [""] * len(source_rxns) + elif genes is True: + rules = [r.gene_reaction_rule for r in source_rxns] + elif isinstance(genes, str): + rules = [genes] * len(source_rxns) + else: + rules = list(genes) + if len(rules) != len(source_rxns): + raise ValueError( + f"genes list has {len(rules)} rules but {len(source_rxns)} " + "reactions are being added." + ) + + # Match metabolites by name[comp]; create only the genuinely new ones. + draft_by_name = {_name_comp(m): m for m in model.metabolites} + new_mets: list[Metabolite] = [] + pending: set[str] = set() + # Track ids minted within this batch so two source mets that share an id + # but differ in name[comp] don't collide when add_metabolites runs. + pending_ids: set[str] = set() + for srx in source_rxns: + for met in srx.metabolites: + key = _name_comp(met) + if key in draft_by_name or key in pending: + continue + pending.add(key) + if met.id not in model.metabolites and met.id not in pending_ids: + new_id = met.id + else: + # _new_met_id only knows the model; loop past in-batch hits too. + new_id = _new_met_id(model, "m") + while new_id in pending_ids: + n = int(new_id[1:]) + 1 + new_id = f"m{n}" + while new_id in model.metabolites: + n += 1 + new_id = f"m{n}" + pending_ids.add(new_id) + new_met = Metabolite( + new_id, + name=met.name, + compartment=met.compartment, + formula=met.formula, + charge=met.charge, + ) + new_met.annotation = copy.deepcopy(met.annotation) + new_met.notes = copy.deepcopy(met.notes) + new_mets.append(new_met) + draft_by_name[key] = new_met + if new_mets: + model.add_metabolites(new_mets) + + added: list[Reaction] = [] + for srx, rule in zip(source_rxns, rules, strict=True): + rxn = Reaction(srx.id, name=srx.name) + rxn.bounds = srx.bounds + rxn.subsystem = srx.subsystem + model.add_reactions([rxn]) + rxn.add_metabolites( + {draft_by_name[_name_comp(met)]: coef for met, coef in srx.metabolites.items()} + ) + if rule: + rxn.gene_reaction_rule = rule + rxn.annotation = copy.deepcopy(srx.annotation) + notes = copy.deepcopy(srx.notes) + if note is not None: + notes["note"] = note + if confidence is not None: + notes["confidence_score"] = confidence + rxn.notes = notes + added.append(rxn) + + return added diff --git a/src/raven_python/manipulation/transport.py b/src/raven_python/manipulation/transport.py new file mode 100644 index 0000000..d0c1bf1 --- /dev/null +++ b/src/raven_python/manipulation/transport.py @@ -0,0 +1,157 @@ +"""Add transport reactions between compartments. + +cobra has no transport-reaction primitive. For each metabolite this matches the +species by *name* across compartments (the source in ``from_compartment`` and its +same-named twin in each target compartment), optionally creating the target +metabolite, and +builds a ``-1 from / +1 to`` reaction with a sequential ``tr_0001`` ID. +""" +from __future__ import annotations + +import re +import warnings +from collections.abc import Iterable + +import cobra +from cobra import Metabolite, Reaction + +from raven_python.manipulation.add import _new_met_id + + +def _index_by_name(mets: Iterable[Metabolite], compartment: str) -> dict[str, Metabolite]: + """Index metabolites by name, warning when a name is duplicated. + + Same-name duplicates in a single compartment are unusual but legal in cobra, + and the previous one-pass dict comprehension silently dropped all but one. + """ + out: dict[str, list[Metabolite]] = {} + for m in mets: + out.setdefault(m.name, []).append(m) + chosen: dict[str, Metabolite] = {} + for name, group in out.items(): + if len(group) > 1: + warnings.warn( + f"Multiple metabolites named {name!r} in compartment {compartment!r} " + f"({[m.id for m in group]}); using {group[0].id!r} for transport.", + stacklevel=3, + ) + chosen[name] = group[0] + return chosen + + +def _transport_id_factory(model: cobra.Model, prefix: str): + pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$") + used = [int(m.group(1)) for r in model.reactions if (m := pattern.match(r.id))] + counter = max(used) + 1 if used else 1 + + def next_id() -> str: + nonlocal counter + while f"{prefix}{counter:04d}" in model.reactions: + counter += 1 + rid = f"{prefix}{counter:04d}" + counter += 1 + return rid + + return next_id + + +def add_transport_reactions( + model: cobra.Model, + from_compartment: str, + to_compartments: str | Iterable[str], + metabolite_names: str | Iterable[str] | None = None, + *, + reversible: bool = True, + only_to_existing: bool = True, + id_prefix: str = "tr_", +) -> list[Reaction]: + """Add transport reactions from one compartment to one or more others. + Parameters + ---------- + from_compartment + Source compartment id. + to_compartments + Target compartment id(s). + metabolite_names + Names of metabolites to transport. Default: every metabolite in + ``from_compartment``. + reversible + If True (default), bounds span the cobra configuration default + (reversible); otherwise lower bound 0. + only_to_existing + If True (default), only transport a metabolite into a target + compartment where a same-named metabolite already exists. If False, + create the missing target metabolite (copying name/formula/charge/ + annotation from the source) before adding the transport. + id_prefix + Prefix for the sequential reaction IDs (``tr_0001``, ...). + + Returns + ------- + list of cobra.Reaction + The transport reactions added, in creation order. + """ + # cobra's `model.compartments` only lists compartments that have metabolites; + # include registered-but-empty ones so transport can target an empty compartment. + known = set(model.compartments) | set(model._compartments) + if from_compartment not in known: + raise ValueError(f"Compartment {from_compartment!r} is not in the model.") + if isinstance(to_compartments, str): + to_compartments = [to_compartments] + else: + to_compartments = list(to_compartments) + for comp in to_compartments: + if comp not in known: + raise ValueError(f"Compartment {comp!r} is not in the model.") + + source = _index_by_name( + (m for m in model.metabolites if m.compartment == from_compartment), + from_compartment, + ) + if metabolite_names is None: + names = list(source) + else: + names = [metabolite_names] if isinstance(metabolite_names, str) else list(metabolite_names) + missing = [n for n in names if n not in source] + if missing: + raise ValueError( + f"Metabolites not found in compartment {from_compartment!r}: {missing}" + ) + + cfg = cobra.Configuration() + bounds = (cfg.lower_bound, cfg.upper_bound) if reversible else (0.0, cfg.upper_bound) + from_name = model.compartments.get(from_compartment) or from_compartment + next_id = _transport_id_factory(model, id_prefix) + + added: list[Reaction] = [] + for to_comp in to_compartments: + to_name = model.compartments.get(to_comp) or to_comp + targets = _index_by_name( + (m for m in model.metabolites if m.compartment == to_comp), + to_comp, + ) + for name in names: + src = source[name] + dst = targets.get(name) + if dst is None: + if only_to_existing: + continue + dst = Metabolite( + _new_met_id(model, "m"), + name=name, + compartment=to_comp, + formula=src.formula, + charge=src.charge, + ) + dst.annotation = dict(src.annotation) + model.add_metabolites([dst]) + targets[name] = dst + + rxn = Reaction(next_id()) + rxn.name = f"{name} transport, {from_name}-{to_name}" + rxn.bounds = bounds + model.add_reactions([rxn]) + rxn.add_metabolites({src: -1, dst: 1}) + added.append(rxn) + + return added diff --git a/tests/test_change_grrules.py b/tests/test_change_grrules.py new file mode 100644 index 0000000..d33f723 --- /dev/null +++ b/tests/test_change_grrules.py @@ -0,0 +1,49 @@ +"""Tests for change_gene_reaction_rules (changeGrRules port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, change_gene_reaction_rules + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}, + {"id": "R2", "equation": "a_c --> b_c"}, + ], + ) + return m + + +def test_replace_rule_and_create_genes(model): + (rxn,) = change_gene_reaction_rules(model, {"R1": "G2 and G3"}) + assert rxn.gene_reaction_rule == "G2 and G3" + assert {g.id for g in rxn.genes} == {"G2", "G3"} + assert {"G2", "G3"} <= {g.id for g in model.genes} + + +def test_append_rule(model): + change_gene_reaction_rules(model, {"R1": "G4"}, replace=False) + # (G1) or (G4), normalised by cobra + assert model.reactions.get_by_id("R1").gene_reaction_rule == "G1 or G4" + + +def test_append_when_empty_is_just_new(model): + change_gene_reaction_rules(model, {"R2": "G5"}, replace=False) + assert model.reactions.get_by_id("R2").gene_reaction_rule == "G5" + + +def test_batch(model): + changed = change_gene_reaction_rules(model, {"R1": "GA", "R2": "GB"}) + assert [r.id for r in changed] == ["R1", "R2"] + + +def test_unknown_reaction_errors(model): + with pytest.raises(ValueError, match="not found"): + change_gene_reaction_rules(model, {"NOPE": "G1"}) diff --git a/tests/test_manipulation_add.py b/tests/test_manipulation_add.py new file mode 100644 index 0000000..2a3a9d3 --- /dev/null +++ b/tests/test_manipulation_add.py @@ -0,0 +1,278 @@ +"""Tests for raven_python.manipulation.add (addRxns port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils.parse import parse_name_comp + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", compartment="c"), + cobra.Metabolite("h2o_c", name="H2O", compartment="c"), + cobra.Metabolite("adp_c", name="ADP", compartment="c"), + cobra.Metabolite("pi_c", name="phosphate", compartment="c"), + ] + ) + return m + + +# --- parse_name_comp ------------------------------------------------------- + +@pytest.mark.parametrize( + "token,expected", + [ + ("ATP[c]", ("ATP", "c")), + ("ATP", ("ATP", None)), + (" ATP[c] ", ("ATP", "c")), + ("weird[name][m]", ("weird[name]", "m")), + ], +) +def test_parse_name_comp(token, expected): + assert parse_name_comp(token) == expected + + +# --- id mode (eqnType 1) --------------------------------------------------- + +def test_add_by_id_basic_and_reversibility(model): + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c + h2o_c <=> adp_c + pi_c"}] + ) + assert rxn.id == "R1" + assert rxn.reversibility is True + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "atp_c": -1.0, + "h2o_c": -1.0, + "adp_c": 1.0, + "pi_c": 1.0, + } + + +def test_irreversible_arrows(model): + rxns = add_reactions_from_equations( + model, + [ + {"id": "R1", "equation": "atp_c --> adp_c"}, + {"id": "R2", "equation": "atp_c => adp_c"}, + ], + ) + for r in rxns: + assert r.lower_bound == 0.0 + assert r.reversibility is False + + +def test_coefficients(model): + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "2 atp_c + 1.5 h2o_c --> adp_c"}] + ) + assert rxn.get_coefficient("atp_c") == -2.0 + assert rxn.get_coefficient("h2o_c") == -1.5 + + +def test_id_mode_creates_new_met_in_compartment(model): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="c", + ) + assert "amp_c" in model.metabolites + assert model.metabolites.get_by_id("amp_c").compartment == "c" + + +def test_id_mode_new_met_without_compartment_errors(model): + with pytest.raises(ValueError, match="no compartment"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> amp_c"}]) + + +# --- name mode (eqnType 2) ------------------------------------------------- + +def test_name_mode_matches_existing_by_name(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP + H2O <=> ADP + phosphate"}], + mets_by="name", + compartment="c", + ) + # resolved to the existing _c metabolites, not new ones + assert {m.id for m in rxn.metabolites} == {"atp_c", "h2o_c", "adp_c", "pi_c"} + assert len(model.metabolites) == 4 + + +def test_name_mode_creates_new_met_with_auto_id(model): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP --> AMP"}], + mets_by="name", + compartment="c", + ) + new = [m for m in model.metabolites if m.name == "AMP"] + assert len(new) == 1 + assert new[0].id == "m1" + assert new[0].compartment == "c" + + +def test_name_mode_requires_compartment(model): + with pytest.raises(ValueError, match="needs a compartment"): + add_reactions_from_equations( + model, [{"id": "R1", "equation": "ATP --> ADP"}], mets_by="name" + ) + + +# --- name[comp] mode (eqnType 3) ------------------------------------------- + +def test_name_comp_syntax(model): + model.add_metabolites([cobra.Metabolite("atp_m", name="ATP", compartment="m")]) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP[c] --> ATP[m]"}], + mets_by="name", + compartment="c", + ) + # matched ATP in two different compartments by name[comp] + assert {m.id for m in rxn.metabolites} == {"atp_c", "atp_m"} + + +# --- genes ----------------------------------------------------------------- + +def test_gene_rule_auto_creates_genes(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}], + ) + assert {g.id for g in rxn.genes} == {"G1", "G2"} + assert {g.id for g in model.genes} == {"G1", "G2"} + + +def test_strict_genes_errors_on_unknown(model): + with pytest.raises(ValueError, match="genes not in the model"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}], + allow_new_genes=False, + ) + + +def test_strict_genes_ok_when_present(model): + model.genes.append(cobra.core.gene.Gene("G1")) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}], + allow_new_genes=False, + ) + assert rxn.gene_reaction_rule == "G1" + + +# --- guards & extras ------------------------------------------------------- + +def test_duplicate_reaction_id_errors(model): + model.add_reactions([cobra.Reaction("R1")]) + with pytest.raises(ValueError, match="already exists"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> adp_c"}]) + + +def test_strict_mets_errors(model): + with pytest.raises(ValueError, match="allow_new_mets"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="c", + allow_new_mets=False, + ) + + +def test_explicit_bounds_override_arrow(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c <=> adp_c", "bounds": (0, 50), "name": "myrxn"}], + ) + assert rxn.bounds == (0, 50) + assert rxn.name == "myrxn" + + +def test_net_zero_metabolite_dropped(model): + # atp_c on both sides nets to zero and is removed. + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c + h2o_c --> atp_c + adp_c"}] + ) + assert "atp_c" not in {m.id for m in rxn.metabolites} + assert {m.id for m in rxn.metabolites} == {"h2o_c", "adp_c"} + + +def test_missing_equation_errors(model): + with pytest.raises(ValueError, match="missing required 'equation'"): + add_reactions_from_equations(model, [{"id": "R1"}]) + + +def test_no_arrow_errors(model): + with pytest.raises(ValueError, match="No reaction arrow"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c + h2o_c"}]) + + +# --- regression: leading-number metabolite name (known_issues.md A1) ------- + +def test_name_mode_preserves_leading_number_name(model): + """A metabolite name that begins with a number isn't misparsed as a coefficient. + + Before the fix the token ``"2 oxoglutarate"`` was parsed as ``(coeff=2, name="oxoglutarate")`` + silently — corrupting the stoichiometry. The resolver now prefers the full + token when it matches an existing metabolite name. + """ + model.add_metabolites([ + cobra.Metabolite("akg_c", name="2 oxoglutarate", compartment="c"), + ]) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP + 2 oxoglutarate --> ADP"}], + mets_by="name", + compartment="c", + ) + assert rxn.get_coefficient("akg_c") == -1.0 # not -2.0 + assert rxn.get_coefficient("atp_c") == -1.0 + + +def test_name_mode_coefficient_still_works_without_collision(model): + """If the full token doesn't match anything, fall back to coefficient split.""" + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "2 ATP + H2O --> ADP + phosphate"}], + mets_by="name", + compartment="c", + ) + assert rxn.get_coefficient("atp_c") == -2.0 + + +# --- regression: empty-stoichiometry warning (known_issues.md A2) ---------- + +def test_empty_stoichiometry_warns(model): + """All-terms-cancel reaction warns instead of silently shipping an empty rxn.""" + with pytest.warns(UserWarning, match="no net metabolites"): + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c --> atp_c"}] + ) + assert len(rxn.metabolites) == 0 + + +# --- regression: unknown-compartment warning (known_issues.md B2) ---------- + +def test_id_mode_unknown_compartment_warns(model): + """A typo'd compartment used to silently produce a one-met ghost compartment + in id mode (the name/[comp] path used to validate, id mode never did).""" + with pytest.warns(UserWarning, match="unregistered compartment 'cyto'"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="cyto", # typo for 'c' + ) + + +def test_name_comp_unknown_compartment_warns(model): + """Same defensive check in the name[comp] path when allow_new_mets=True.""" + with pytest.warns(UserWarning, match="unregistered compartment 'mito'"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP[c] --> AMP[mito]"}], + mets_by="name", + ) diff --git a/tests/test_manipulation_change.py b/tests/test_manipulation_change.py new file mode 100644 index 0000000..8d54f58 --- /dev/null +++ b/tests/test_manipulation_change.py @@ -0,0 +1,93 @@ +"""Tests for raven_python.manipulation.change (changeRxns port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, change_reaction_equations + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", name="A", compartment="c"), + cobra.Metabolite("b_c", name="B", compartment="c"), + cobra.Metabolite("c_c", name="C", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + { + "id": "R1", + "equation": "a_c <=> b_c", + "name": "first", + "bounds": (-30, 70), + "gene_reaction_rule": "G1 or G2", + "subsystem": "sub", + }, + {"id": "R2", "equation": "a_c --> c_c"}, + ], + ) + return m + + +def test_changes_stoichiometry(model): + (rxn,) = change_reaction_equations(model, {"R1": "a_c --> 2 c_c"}) + assert rxn.id == "R1" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "a_c": -1.0, + "c_c": 2.0, + } + + +def test_preserves_other_fields(model): + before = model.reactions.get_by_id("R1") + name, bounds, subsystem = before.name, before.bounds, before.subsystem + genes = {g.id for g in before.genes} + + change_reaction_equations(model, {"R1": "a_c --> c_c"}) + + after = model.reactions.get_by_id("R1") + assert after.name == name + assert after.bounds == bounds # bounds untouched, per RAVEN + assert after.subsystem == subsystem + assert {g.id for g in after.genes} == genes + + +def test_preserves_reaction_order(model): + order_before = [r.id for r in model.reactions] + change_reaction_equations(model, {"R1": "b_c --> c_c"}) + assert [r.id for r in model.reactions] == order_before + + +def test_bounds_not_changed_by_arrow(model): + # R1 starts reversible (-30, 70); a --> arrow must NOT make it irreversible. + change_reaction_equations(model, {"R1": "a_c --> b_c"}) + assert model.reactions.get_by_id("R1").bounds == (-30, 70) + + +def test_name_mode(model): + (rxn,) = change_reaction_equations( + model, {"R2": "A --> C"}, mets_by="name", compartment="c" + ) + assert {m.id for m in rxn.metabolites} == {"a_c", "c_c"} + + +def test_can_introduce_new_met(model): + change_reaction_equations( + model, {"R2": "a_c --> d_c"}, compartment="c" + ) + assert "d_c" in model.metabolites + assert model.reactions.get_by_id("R2").get_coefficient("d_c") == 1.0 + + +def test_unknown_reaction_errors(model): + with pytest.raises(ValueError, match="not found"): + change_reaction_equations(model, {"NOPE": "a_c --> b_c"}) + + +def test_multiple_reactions(model): + changed = change_reaction_equations(model, {"R1": "a_c --> c_c", "R2": "b_c --> c_c"}) + assert [r.id for r in changed] == ["R1", "R2"] + assert model.reactions.get_by_id("R2").get_coefficient("b_c") == -1.0 diff --git a/tests/test_manipulation_compartments.py b/tests/test_manipulation_compartments.py new file mode 100644 index 0000000..4d3fb3b --- /dev/null +++ b/tests/test_manipulation_compartments.py @@ -0,0 +1,139 @@ +"""Tests for manipulation/compartments.py — merge_compartments + copy_to_compartment.""" +from __future__ import annotations + +import cobra +import pytest + +from raven_python.manipulation.compartments import copy_to_compartment, merge_compartments + + +def _two_compartment_model() -> cobra.Model: + """A_c → B_c, A_m → B_m, and a transport A_c ↔ A_m. Multi-compartment toy.""" + m = cobra.Model("toy") + A_c = cobra.Metabolite("A_c", name="A", compartment="c") + A_m = cobra.Metabolite("A_m", name="A", compartment="m") + B_c = cobra.Metabolite("B_c", name="B", compartment="c") + B_m = cobra.Metabolite("B_m", name="B", compartment="m") + m.add_metabolites([A_c, A_m, B_c, B_m]) + + def rxn(rid, lb, ub, mets, gpr=None): + r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub) + r.add_metabolites(mets) + if gpr: + r.gene_reaction_rule = gpr + return r + m.add_reactions([rxn("r_c", 0, 1000, {A_c: -1, B_c: 1}, "g1"), + rxn("r_m", 0, 1000, {A_m: -1, B_m: 1}, "g2"), + rxn("tr_A", -1000, 1000, {A_c: -1, A_m: 1})]) + return m + + +# ----------------------------------------------------------------- merge_compartments + +def test_merge_compartments_collapses_to_one(): + """A_c + A_m → A; B_c + B_m → B; transport A_c↔A_m self-cancels and is dropped.""" + m = _two_compartment_model() + merged, deleted, dupes = merge_compartments(m) + # Only the base ids survive. + assert {x.id for x in merged.metabolites} == {"A", "B"} + # The transport reaction collapsed (A → A) and was deleted. + assert "tr_A" in deleted + # r_c and r_m are now both A → B; one of them gets deduplicated. + surviving = {r.id for r in merged.reactions} + assert len(surviving & {"r_c", "r_m"}) == 1 + assert (set(dupes) | (surviving & {"r_c", "r_m"})) == {"r_c", "r_m"} + + +def test_merge_compartments_preserves_gpr_and_subsystem(): + m = _two_compartment_model() + m.reactions.r_c.subsystem = "carbo" + merged, _, _ = merge_compartments(m) + survivor = next(r for r in merged.reactions if r.id in {"r_c", "r_m"}) + # The survivor keeps its gene rule + subsystem (cobra may sometimes lose them + # through copy; we set them explicitly). + assert survivor.gene_reaction_rule in {"g1", "g2"} + if survivor.id == "r_c": + assert survivor.subsystem == "carbo" + + +def test_merge_compartments_keeps_single_met_reactions_when_asked(): + """drop_single_metabolite_reactions=False keeps the collapsed transport (now A → A, + which is empty stoichiometry after net-cancellation — still dropped, but the *one-met* + case is the more interesting one). Use a uniport pattern to exercise it.""" + m = cobra.Model("uniport") + A_c = cobra.Metabolite("A_c", name="A", compartment="c") + A_m = cobra.Metabolite("A_m", name="A", compartment="m") + H_c = cobra.Metabolite("H_c", name="H", compartment="c") + m.add_metabolites([A_c, A_m, H_c]) + # H+ symport: A_c + H_c → A_m. After merge: A + H → A → leaves H. + sym = cobra.Reaction("sym", lower_bound=0, upper_bound=1000) + sym.add_metabolites({A_c: -1, H_c: -1, A_m: 1}) + m.add_reactions([sym]) + merged_drop, deleted_drop, _ = merge_compartments(m, drop_single_metabolite_reactions=True) + assert "sym" in deleted_drop + merged_keep, deleted_keep, _ = merge_compartments(m, drop_single_metabolite_reactions=False) + # With keep, sym survives as a one-met reaction (consumes H). + assert "sym" not in deleted_keep + assert "sym" in {r.id for r in merged_keep.reactions} + + +def test_merge_compartments_deduplicate_off_keeps_both(): + m = _two_compartment_model() + merged, _, dupes = merge_compartments(m, deduplicate_reactions=False) + assert dupes == [] + assert {"r_c", "r_m"} <= {r.id for r in merged.reactions} + + +# ----------------------------------------------------------------- copy_to_compartment + +def test_copy_to_compartment_basic(): + """Copy r_c into 'p' (peroxisome): a new reaction r_c_p with metabolites in p.""" + m = _two_compartment_model() + out, new_rxns, new_mets = copy_to_compartment(m, ["r_c"], "p", + target_compartment_name="peroxisome") + assert "r_c_p" in [r.id for r in out.reactions] + new_r = out.reactions.r_c_p + assert {x.compartment for x in new_r.metabolites} == {"p"} + assert "A_p" in [x.id for x in out.metabolites] + assert "B_p" in [x.id for x in out.metabolites] + assert new_rxns == ["r_c_p"] + assert set(new_mets) == {"A_p", "B_p"} + # Original still there. + assert "r_c" in [r.id for r in out.reactions] + + +def test_copy_to_compartment_preserves_gpr_and_bounds(): + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p") + new_r = out.reactions.r_c_p + assert new_r.gene_reaction_rule == "g1" + assert new_r.lower_bound == 0 and new_r.upper_bound == 1000 + + +def test_copy_to_compartment_delete_original_is_a_move(): + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p", delete_original=True) + assert "r_c" not in [r.id for r in out.reactions] + assert "r_c_p" in [r.id for r in out.reactions] + + +def test_copy_to_compartment_idempotent(): + """Calling twice doesn't add the reaction twice.""" + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p") + out2, new_rxns, _ = copy_to_compartment(out, ["r_c"], "p") + assert new_rxns == [] # nothing added on second call + assert len([r for r in out2.reactions if r.id == "r_c_p"]) == 1 + + +def test_copy_to_compartment_unknown_reaction_raises(): + m = _two_compartment_model() + with pytest.raises(ValueError, match="not in model"): + copy_to_compartment(m, ["does_not_exist"], "p") + + +def test_copy_to_compartment_custom_suffix(): + m = _two_compartment_model() + out, new_rxns, _ = copy_to_compartment(m, ["r_c"], "p", id_suffix="copy1") + assert new_rxns == ["r_c_copy1"] + assert "A_copy1" in [x.id for x in out.metabolites] diff --git a/tests/test_manipulation_expand.py b/tests/test_manipulation_expand.py new file mode 100644 index 0000000..08cd2f2 --- /dev/null +++ b/tests/test_manipulation_expand.py @@ -0,0 +1,288 @@ +"""Tests for expand_model (RAVEN expandModel.m) — splitting isozymes into reactions. + +Adopted from geckopy's tests/test_expand.py. +""" +import cobra + +from raven_python.manipulation import expand_model +from raven_python.manipulation.expand import _gpr_to_dnf + +# --------------------------------------------------------------------------- # +# DNF conversion (internal helper, worth testing directly) +# --------------------------------------------------------------------------- # + +def _dnf_from_gpr_string(gpr_str: str) -> list[list[str]]: + from cobra.core.gene import GPR + + gpr = GPR.from_string(gpr_str) + return _gpr_to_dnf(gpr) + + +def test_dnf_empty_gpr(): + assert _dnf_from_gpr_string("") == [] + + +def test_dnf_single_gene(): + assert _dnf_from_gpr_string("g1") == [["g1"]] + + +def test_dnf_simple_and(): + assert _dnf_from_gpr_string("g1 and g2") == [["g1", "g2"]] + + +def test_dnf_simple_or(): + assert _dnf_from_gpr_string("g1 or g2") == [["g1"], ["g2"]] + + +def test_dnf_or_of_ands(): + assert _dnf_from_gpr_string("(g1 and g2) or (g3 and g4)") == [ + ["g1", "g2"], + ["g3", "g4"], + ] + + +def test_dnf_distributes_and_over_or(): + result = _dnf_from_gpr_string("g1 and (g2 or g3)") + assert result == [["g1", "g2"], ["g1", "g3"]] + + +def test_dnf_triple_or(): + assert _dnf_from_gpr_string("g1 or g2 or g3") == [ + ["g1"], ["g2"], ["g3"], + ] + + +def test_dnf_preserves_gene_order_within_clause(): + result = _dnf_from_gpr_string("g3 and g1 and g2") + assert result == [["g3", "g1", "g2"]] + + +# --------------------------------------------------------------------------- # +# expand_model +# --------------------------------------------------------------------------- # + +def _build_model( + reactions: list[tuple[str, dict[str, float], float, float, str]], +) -> cobra.Model: + """Build from (rxn_id, {met_id: coef}, lb, ub, gpr) tuples.""" + model = cobra.Model("test") + mets: dict[str, cobra.Metabolite] = {} + for _, stoich, _, _, _ in reactions: + for met_id in stoich: + if met_id not in mets: + mets[met_id] = cobra.Metabolite(met_id, compartment="c") + + for rxn_id, stoich, lb, ub, gpr in reactions: + rxn = cobra.Reaction(rxn_id) + rxn.lower_bound = lb + rxn.upper_bound = ub + rxn.add_metabolites({mets[m]: c for m, c in stoich.items()}) + if gpr: + rxn.gene_reaction_rule = gpr + model.add_reactions([rxn]) + return model + + +def test_does_not_expand_reaction_without_gpr(): + model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "")]) + added = expand_model(model) + assert added == [] + assert "r1" in {r.id for r in model.reactions} + + +def test_does_not_expand_single_and_clause(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 and g2"), + ]) + added = expand_model(model) + assert added == [] + r1 = model.reactions.get_by_id("r1") + assert r1.gene_reaction_rule == "g1 and g2" + + +def test_does_not_expand_single_gene(): + model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1")]) + added = expand_model(model) + assert added == [] + assert model.reactions.get_by_id("r1").gene_reaction_rule == "g1" + + +def test_splits_simple_or_into_two_reactions(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + rxn_ids = {r.id for r in model.reactions} + assert "r1" not in rxn_ids + assert "r1_EXP_1" in rxn_ids + assert "r1_EXP_2" in rxn_ids + + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g2" + + +def test_splits_or_of_ands(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "(g1 and g2) or (g3 and g4)"), + ]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g3 and g4" + + +def test_distributes_and_over_or(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "g1 and (g2 or g3)"), + ]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g1 and g3" + + +def test_expanded_reactions_inherit_stoichiometry_and_bounds(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 2.0}, -500.0, 1500.0, "g1 or g2"), + ]) + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.bounds == (-500.0, 1500.0) + stoich = {m.id: c for m, c in rxn.metabolites.items()} + assert stoich == {"A": -1.0, "B": 2.0} + + +def test_expanded_reactions_inherit_name_and_subsystem(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + r1 = model.reactions.get_by_id("r1") + r1.name = "an isozyme-catalyzed reaction" + r1.subsystem = "central metabolism" + + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.name == "an isozyme-catalyzed reaction" + assert rxn.subsystem == "central metabolism" + + +def test_multiple_reactions_expand_independently(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"), + ("r3", {"C": -1.0, "D": 1.0}, 0.0, 1000.0, + "(g5 and g6) or g7 or (g8 and g9)"), + ]) + added = expand_model(model) + + assert added == sorted([ + "r1_EXP_1", "r1_EXP_2", + "r3_EXP_1", "r3_EXP_2", "r3_EXP_3", + ]) + + rxn_ids = {r.id for r in model.reactions} + assert "r2" in rxn_ids + assert "r1" not in rxn_ids + assert "r3" not in rxn_ids + + assert model.reactions.get_by_id("r2").gene_reaction_rule == "g3 and g4" + assert model.reactions.get_by_id("r3_EXP_1").gene_reaction_rule == "g5 and g6" + assert model.reactions.get_by_id("r3_EXP_2").gene_reaction_rule == "g7" + assert model.reactions.get_by_id("r3_EXP_3").gene_reaction_rule == "g8 and g9" + + +def test_expanded_reaction_has_correct_gene_set(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "(g1 and g2) or (g3 and g4)"), + ]) + expand_model(model) + + r1_1 = model.reactions.get_by_id("r1_EXP_1") + assert {g.id for g in r1_1.genes} == {"g1", "g2"} + + r1_2 = model.reactions.get_by_id("r1_EXP_2") + assert {g.id for g in r1_2.genes} == {"g3", "g4"} + + +def test_expansion_is_idempotent_in_the_no_op_sense(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"), + ]) + expand_model(model) + ids_before = {r.id for r in model.reactions} + + second = expand_model(model) + assert second == [] + + ids_after = {r.id for r in model.reactions} + assert ids_after == ids_before + + +def test_empty_model_is_unchanged(): + model = cobra.Model("empty") + assert expand_model(model) == [] + + +# --------------------------------------------------------------------------- # +# Annotation and notes propagation +# --------------------------------------------------------------------------- # + +def test_expanded_reactions_inherit_annotation_and_notes(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + r1 = model.reactions.get_by_id("r1") + r1.annotation["ec-code"] = "1.2.3.4" + r1.annotation["sbo"] = "SBO:0000176" + r1.notes["custom"] = "hello" + + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.annotation["ec-code"] == "1.2.3.4" + assert rxn.annotation["sbo"] == "SBO:0000176" + assert rxn.notes["custom"] == "hello" + + +def test_expanded_reaction_annotation_is_independent_of_parent(): + """Mutating one expanded reaction's annotation must not affect siblings.""" + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + model.reactions.get_by_id("r1").annotation["ec-code"] = ["1.2.3.4"] + + expand_model(model) + + r1_1 = model.reactions.get_by_id("r1_EXP_1") + r1_2 = model.reactions.get_by_id("r1_EXP_2") + r1_1.annotation["ec-code"].append("9.9.9.9") + assert r1_2.annotation["ec-code"] == ["1.2.3.4"] + + +def test_objective_coefficient_preserved_on_expansion(): + """An expanded reaction's isozyme copies retain the original objective coefficient.""" + m = cobra.Model("o") + a, b = (cobra.Metabolite(x, compartment="c") for x in "ab") + m.add_metabolites([a, b]) + r = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r.add_metabolites({a: -1, b: 1}) + r.gene_reaction_rule = "g1 or g2" + m.add_reactions([r]) + m.objective = "r1" # objective on the soon-to-be-expanded reaction + + expand_model(m) + coeffs = {rx.id: rx.objective_coefficient for rx in m.reactions} + assert coeffs == {"r1_EXP_1": 1.0, "r1_EXP_2": 1.0} # objective survives on both copies diff --git a/tests/test_manipulation_irreversible.py b/tests/test_manipulation_irreversible.py new file mode 100644 index 0000000..e211fa3 --- /dev/null +++ b/tests/test_manipulation_irreversible.py @@ -0,0 +1,144 @@ +"""Tests for convert_to_irreversible (RAVEN convertToIrrev.m). + +Adopted from geckopy's tests/test_preprocess.py (the convert_to_irreversible subset). +Exchange reactions are excluded from the split, matching MATLAB behavior. +""" +import cobra + +from raven_python.manipulation import convert_to_irreversible + + +def _build_model_with_bounds( + reactions: list[tuple[str, dict[str, float], float, float]], +) -> cobra.Model: + """Build from (rxn_id, {met_id: coef}, lb, ub) tuples.""" + model = cobra.Model("test") + mets: dict[str, cobra.Metabolite] = {} + for _, stoich, _, _ in reactions: + for met_id in stoich: + if met_id not in mets: + mets[met_id] = cobra.Metabolite(met_id, compartment="c") + + for rxn_id, stoich, lb, ub in reactions: + rxn = cobra.Reaction(rxn_id) + rxn.lower_bound = lb + rxn.upper_bound = ub + rxn.add_metabolites({mets[m]: c for m, c in stoich.items()}) + model.add_reactions([rxn]) + return model + + +def test_splits_single_reversible_non_exchange(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + + added = convert_to_irreversible(model) + assert added == ["r1_REV"] + + fwd = model.reactions.get_by_id("r1") + rev = model.reactions.get_by_id("r1_REV") + + assert fwd.bounds == (0.0, 1000.0) + assert {m.id: c for m, c in fwd.metabolites.items()} == {"A": -1.0, "B": 1.0} + + assert rev.bounds == (0.0, 500.0) + assert {m.id: c for m, c in rev.metabolites.items()} == {"A": 1.0, "B": -1.0} + + +def test_does_not_split_forward_only_reaction(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0), + ]) + added = convert_to_irreversible(model) + assert added == [] + assert "r1_REV" not in {r.id for r in model.reactions} + + +def test_does_not_split_exchange_reaction_even_if_reversible(): + """Exchange reactions (one metabolite) are explicitly excluded from + the irreversibility step in MATLAB, regardless of bounds.""" + model = _build_model_with_bounds([ + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + added = convert_to_irreversible(model) + assert added == [] + ex = model.reactions.get_by_id("EX_A") + assert ex.bounds == (-1000.0, 1000.0) + + +def test_splits_multiple_mixed_reactions(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), # split + ("r2", {"B": -2.0, "C": 3.0}, 0.0, 1000.0), # forward only + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), # exchange + ("r3", {"C": -1.0, "D": 1.0}, -200.0, 200.0), # split + ]) + + added = convert_to_irreversible(model) + assert added == ["r1_REV", "r3_REV"] + + assert model.reactions.get_by_id("r1").bounds == (0.0, 1000.0) + assert model.reactions.get_by_id("r1_REV").bounds == (0.0, 500.0) + assert model.reactions.get_by_id("r2").bounds == (0.0, 1000.0) + assert model.reactions.get_by_id("EX_A").bounds == (-1000.0, 1000.0) + assert model.reactions.get_by_id("r3").bounds == (0.0, 200.0) + assert model.reactions.get_by_id("r3_REV").bounds == (0.0, 200.0) + + +def test_reverse_reaction_inherits_gpr(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + model.reactions.get_by_id("r1").gene_reaction_rule = "g1 and g2" + + convert_to_irreversible(model) + + rev = model.reactions.get_by_id("r1_REV") + assert rev.gene_reaction_rule == "g1 and g2" + assert {g.id for g in rev.genes} == {"g1", "g2"} + + +def test_forward_reaction_lb_is_clamped_to_zero(): + """After splitting, the original reaction should have lb = 0, + which is what MATLAB's convertToIrrev does.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + convert_to_irreversible(model) + assert model.reactions.get_by_id("r1").lower_bound == 0.0 + + +def test_no_reverse_reaction_has_negative_bound(): + """After conversion, no non-exchange reaction may carry negative flux.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ("r2", {"B": -1.0, "C": 1.0}, -1000.0, 0.0), # blocked reverse + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + convert_to_irreversible(model) + for rxn in model.reactions: + if rxn.boundary: + continue + assert rxn.lower_bound >= 0, f"{rxn.id} still has lb < 0" + + +def test_returns_empty_list_when_nothing_to_split(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0), + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + assert convert_to_irreversible(model) == [] + + +def test_conversion_is_idempotent_after_first_pass(): + """Running convert_to_irreversible twice should not create + `_REV_REV` reactions, because the first pass already clamped + all non-exchange lb to 0.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + convert_to_irreversible(model) + second = convert_to_irreversible(model) + assert second == [] + assert "r1_REV_REV" not in {r.id for r in model.reactions} diff --git a/tests/test_manipulation_merge.py b/tests/test_manipulation_merge.py new file mode 100644 index 0000000..a430f6e --- /dev/null +++ b/tests/test_manipulation_merge.py @@ -0,0 +1,136 @@ +"""Tests for merge_models (mergeModels port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, merge_models + + +def _model(mid, mets, reactions): + m = cobra.Model(mid) + m.add_metabolites(mets) + add_reactions_from_equations(m, reactions) + return m + + +@pytest.fixture +def model_a(): + return _model( + "A", + [ + cobra.Metabolite("glc_c", name="Glucose", compartment="c"), + cobra.Metabolite("g6p_c", name="G6P", compartment="c"), + ], + [{"id": "HEX", "equation": "glc_c --> g6p_c", "gene_reaction_rule": "GA"}], + ) + + +@pytest.fixture +def model_b(): + # same Glucose[c] compound but a DIFFERENT id + return _model( + "B", + [ + cobra.Metabolite("glucose_c", name="Glucose", compartment="c"), + cobra.Metabolite("lac_c", name="Lactate", compartment="c"), + ], + [{"id": "LDH", "equation": "glucose_c --> lac_c", "gene_reaction_rule": "GB"}], + ) + + +def test_unifies_metabolites_by_name_comp(model_a, model_b): + merged = merge_models([model_a, model_b]) + glucoses = [m for m in merged.metabolites if m.name == "Glucose" and m.compartment == "c"] + assert len(glucoses) == 1 # glc_c and glucose_c unified + # both reactions reference the same merged Glucose object + hex_glc = [m for m in merged.reactions.get_by_id("HEX").metabolites if m.name == "Glucose"][0] + ldh_glc = [m for m in merged.reactions.get_by_id("LDH").metabolites if m.name == "Glucose"][0] + assert hex_glc is ldh_glc + + +def test_match_by_id_keeps_distinct(model_a, model_b): + merged = merge_models([model_a, model_b], match_by="id") + glucoses = [m for m in merged.metabolites if m.name == "Glucose"] + assert len(glucoses) == 2 # glc_c and glucose_c are distinct by id + + +def test_all_reactions_kept(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert {"HEX", "LDH"} <= {r.id for r in merged.reactions} + + +def test_reaction_id_collision_renamed(model_a): + # two models with the same reaction id but different chemistry + other = _model( + "B", + [cobra.Metabolite("glc_c", name="Glucose", compartment="c"), + cobra.Metabolite("x_c", name="X", compartment="c")], + [{"id": "HEX", "equation": "glc_c --> x_c"}], + ) + merged = merge_models([model_a, other]) + assert "HEX" in {r.id for r in merged.reactions} + assert "HEX_B" in {r.id for r in merged.reactions} # renamed with source id + + +def test_genes_merged(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert {"GA", "GB"} <= {g.id for g in merged.genes} + + +def test_provenance_recorded(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert merged.reactions.get_by_id("HEX").notes["origin"] == "A" + assert merged.reactions.get_by_id("LDH").notes["origin"] == "B" + assert merged.genes.get_by_id("GA").notes["origin"] == "A" + + +def test_compartments_preserved(model_a): + model_a.compartments = {"c": "cytoplasm"} + merged = merge_models([model_a, model_a.copy()]) + assert merged.compartments.get("c") == "cytoplasm" + + +def test_single_model_returns_copy(model_a): + merged = merge_models([model_a]) + assert merged is not model_a + assert {r.id for r in merged.reactions} == {r.id for r in model_a.reactions} + + +def test_three_models(model_a, model_b): + c = _model("C", [cobra.Metabolite("co2_c", name="CO2", compartment="c")], + [{"id": "SINK", "equation": "co2_c -->"}]) + merged = merge_models([model_a, model_b, c]) + assert {"HEX", "LDH", "SINK"} <= {r.id for r in merged.reactions} + + +def test_bad_match_by(model_a, model_b): + with pytest.raises(ValueError, match="match_by"): + merge_models([model_a, model_b], match_by="oops") + + +# --- regression: formula/charge conflict (known_issues.md B1) -------------- + +def test_formula_conflict_warns(): + """Two models sharing a name[comp] but with different formulas warn instead + of silently keeping the first.""" + a = _model("A", + [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", compartment="c")], + [{"id": "EX_A", "equation": "g1 -->"}]) + b = _model("B", + [cobra.Metabolite("g2", name="Glucose", formula="C6H12O7", compartment="c")], + [{"id": "EX_B", "equation": "g2 -->"}]) + with pytest.warns(UserWarning, match="different formulas"): + merged = merge_models([a, b]) + # The merge still picks the first-seen — the test asserts the warning fired + # and the model survives. + assert "EX_A" in merged.reactions and "EX_B" in merged.reactions + + +def test_charge_conflict_warns(): + a = _model("A", + [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", charge=0, compartment="c")], + [{"id": "EX_A", "equation": "g1 -->"}]) + b = _model("B", + [cobra.Metabolite("g2", name="Glucose", formula="C6H12O6", charge=-1, compartment="c")], + [{"id": "EX_B", "equation": "g2 -->"}]) + with pytest.warns(UserWarning, match="different charges"): + merge_models([a, b]) diff --git a/tests/test_manipulation_remove.py b/tests/test_manipulation_remove.py new file mode 100644 index 0000000..2b659b9 --- /dev/null +++ b/tests/test_manipulation_remove.py @@ -0,0 +1,97 @@ +"""Tests for raven_python.manipulation.remove (removeMets/removeGenes ports).""" +import cobra +import pytest + +from raven_python.manipulation import ( + add_reactions_from_equations, + remove_genes, + remove_metabolites, +) + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", compartment="c"), + cobra.Metabolite("atp_m", name="ATP", compartment="m"), + cobra.Metabolite("adp_c", name="ADP", compartment="c"), + cobra.Metabolite("x_c", name="X", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}, + {"id": "R2", "equation": "atp_c --> x_c", "gene_reaction_rule": "G3 or G4"}, + {"id": "R3", "equation": "atp_m --> adp_c"}, # no GPR (spontaneous) + ], + ) + return m + + +# --- remove_metabolites ---------------------------------------------------- + +def test_remove_metabolites_by_id(model): + remove_metabolites(model, ["x_c"]) + assert "x_c" not in model.metabolites + # reaction kept, just lost the metabolite + assert "R2" in model.reactions + + +def test_remove_metabolites_by_name_across_compartments(model): + # "ATP" exists in c and m; by_name removes both at once. + remove_metabolites(model, ["ATP"], by_name=True) + assert "atp_c" not in model.metabolites + assert "atp_m" not in model.metabolites + assert "adp_c" in model.metabolites + + +def test_remove_metabolites_destructive(model): + remove_metabolites(model, ["adp_c"], destructive=True) + # R1 and R3 both produced adp_c -> removed + assert "adp_c" not in model.metabolites + assert "R1" not in model.reactions and "R3" not in model.reactions + + +# --- remove_genes ---------------------------------------------------------- + +def test_remove_genes_remove_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="remove") + # R1 = "G1 and G2": removing G1 breaks the complex -> blocked -> removed + assert blocked == ["R1"] + assert "R1" not in model.reactions + assert "R2" in model.reactions # OR rule unaffected + + +def test_remove_genes_constrain_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="constrain") + assert blocked == ["R1"] + r1 = model.reactions.get_by_id("R1") + assert r1.bounds == (0, 0) # kept but constrained, per RAVEN default + assert r1.gene_reaction_rule == "" + + +def test_remove_genes_keep_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="keep") + assert blocked == ["R1"] + r1 = model.reactions.get_by_id("R1") + assert r1.gene_reaction_rule == "" + assert r1.bounds != (0, 0) # left untouched + + +def test_remove_genes_or_rule_not_blocked(model): + blocked = remove_genes(model, ["G3"], blocked_reactions="remove") + # R2 = "G3 or G4": removing G3 leaves G4 -> not blocked + assert blocked == [] + assert model.reactions.get_by_id("R2").gene_reaction_rule == "G4" + + +def test_remove_genes_absent_gene_is_noop(model): + assert remove_genes(model, ["NOPE"]) == [] + + +def test_remove_genes_bad_policy(model): + with pytest.raises(ValueError, match="blocked_reactions"): + remove_genes(model, ["G1"], blocked_reactions="explode") diff --git a/tests/test_manipulation_simplify.py b/tests/test_manipulation_simplify.py new file mode 100644 index 0000000..586a0c3 --- /dev/null +++ b/tests/test_manipulation_simplify.py @@ -0,0 +1,184 @@ +"""Tests for simplifyModel reduction modes.""" +import cobra +import pytest + +from raven_python.manipulation import ( + add_reactions_from_equations, + constrain_reversible_reactions, + group_linear_reactions, + remove_dead_end_reactions, + remove_duplicate_reactions, +) + +# --- remove_dead_end_reactions -------------------------------------------- + +def test_dead_end_removed(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "dead")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a"}, + {"id": "R1", "equation": "a --> b"}, + {"id": "R_out", "equation": "b --> "}, + {"id": "R_dead", "equation": "a --> dead"}, # 'dead' only produced + ], + ) + removed_rxns, removed_mets = remove_dead_end_reactions(m) + assert "R_dead" in removed_rxns + assert "dead" in removed_mets + # the productive path survives + assert {"R_in", "R1", "R_out"} <= {r.id for r in m.reactions} + + +def test_dead_end_respects_reserved(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "dead")]) + add_reactions_from_equations( + m, [{"id": "R_in", "equation": " --> a"}, {"id": "R_dead", "equation": "a --> dead"}] + ) + removed_rxns, _ = remove_dead_end_reactions(m, reserved=["R_dead"]) + assert "R_dead" not in removed_rxns + assert "R_dead" in {r.id for r in m.reactions} + + +# --- remove_duplicate_reactions ------------------------------------------- + +def test_duplicates_removed(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)}, + {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)}, # duplicate of R1 + {"id": "R3", "equation": "a --> b", "bounds": (0, 500)}, # different bounds + ], + ) + removed = remove_duplicate_reactions(m) + assert len(removed) == 1 # one of R1/R2 removed + assert {"R3"} <= {r.id for r in m.reactions} + assert sum(r.id in ("R1", "R2") for r in m.reactions) == 1 + + +def test_duplicates_keep_reserved(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)}, + {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)}, + ], + ) + remove_duplicate_reactions(m, reserved=["R1"]) + assert "R1" in {r.id for r in m.reactions} # reserved one kept + + +# --- constrain_reversible_reactions --------------------------------------- + +def test_forward_only_reversible_constrained(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a", "bounds": (0, 1000)}, + {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)}, # can only go fwd + {"id": "R_out", "equation": "b --> ", "bounds": (0, 1000)}, + ], + ) + changed = constrain_reversible_reactions(m) + assert "R1" in changed + assert m.reactions.get_by_id("R1").lower_bound == 0 # constrained to forward + + +def test_truly_reversible_unchanged(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " <=> a", "bounds": (-1000, 1000)}, + {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)}, + {"id": "R_out", "equation": "b <=> ", "bounds": (-1000, 1000)}, + ], + ) + changed = constrain_reversible_reactions(m) + assert "R1" not in changed # can go both ways + + +# --- group_linear_reactions ----------------------------------------------- + +def test_linear_chain_merged(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b"}, # b: single producer + {"id": "R2", "equation": "b --> c"}, # b: single consumer + ], + ) + n_before = len(m.reactions) + group_linear_reactions(m) + # b is eliminated; R1+R2 merged into one reaction a --> c + assert "b" not in m.metabolites + assert len(m.reactions) < n_before + + +def test_group_linear_discards_genes(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "gene_reaction_rule": "G1"}, + {"id": "R2", "equation": "b --> c", "gene_reaction_rule": "G2"}, + ], + ) + group_linear_reactions(m) + assert len(m.genes) == 0 + + +# --- regression: incremental merge collapses a long chain (known_issues.md D1) --- + +def test_group_linear_merges_long_chain_in_one_pass(): + """The incremental scan still flattens a 5-reaction linear chain — the + correctness property the original O(n²·m) restart-after-merge loop had.""" + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in "abcdef"]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a"}, + {"id": "R1", "equation": "a --> b"}, + {"id": "R2", "equation": "b --> c"}, + {"id": "R3", "equation": "c --> d"}, + {"id": "R4", "equation": "d --> e"}, + {"id": "R5", "equation": "e --> f"}, + {"id": "R_out", "equation": "f --> "}, + ], + ) + group_linear_reactions(m) + # All the chain's internal metabolites are gone. + assert {x for x in m.metabolites if x.id in {"b", "c", "d", "e"}} == set() + + +# --- regression: NaN FVA on infeasible model (known_issues.md C1) ---------- + +def test_constrain_reversible_raises_on_infeasible(): + """An infeasible model produces NaN FVA ranges; the old abs(NaN) < eps + check silently treated those as 'truly reversible'. Now raises.""" + m = cobra.Model("t") + a, b = (cobra.Metabolite(x, compartment="c") for x in ("a", "b")) + m.add_metabolites([a, b]) + # Force a contradiction: r requires production AND consumption of a, but + # nothing else produces a. + r = cobra.Reaction("r", lower_bound=-1, upper_bound=1) + r.add_metabolites({a: -1, b: 1}) + forced = cobra.Reaction("forced", lower_bound=5, upper_bound=10) # infeasible + forced.add_metabolites({a: -1}) + m.add_reactions([r, forced]) + with pytest.raises(RuntimeError, match="infeasible"): + constrain_reversible_reactions(m) diff --git a/tests/test_manipulation_transfer.py b/tests/test_manipulation_transfer.py new file mode 100644 index 0000000..61c2ac9 --- /dev/null +++ b/tests/test_manipulation_transfer.py @@ -0,0 +1,137 @@ +"""Tests for add_reactions_from_model (addRxnsGenesMets port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, add_reactions_from_model + + +@pytest.fixture +def draft(): + m = cobra.Model("draft") + m.add_metabolites( + [cobra.Metabolite("glc_c", name="Glucose", formula="C6H12O6", compartment="c")] + ) + # an existing reaction so glc_c is in use and we have an id to test skipping + add_reactions_from_equations(m, [{"id": "R_existing", "equation": "glc_c <=>"}]) + return m + + +@pytest.fixture +def source(): + m = cobra.Model("source") + m.add_metabolites( + [ + # same name[comp] as draft's glc_c but a DIFFERENT id + cobra.Metabolite("glucose_c", name="Glucose", formula="C6H12O6", compartment="c"), + cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"), + cobra.Metabolite("g6p_c", name="G6P", formula="C6H13O9P", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + { + "id": "HEX", + "equation": "glucose_c + atp_c --> g6p_c", + "name": "hexokinase", + "bounds": (0, 1000), + "gene_reaction_rule": "G1", + "subsystem": "glycolysis", + }, + {"id": "R_existing", "equation": "glucose_c <=>"}, # id already in draft + ], + ) + return m + + +def test_metabolite_matched_by_name_comp_not_id(draft, source): + add_reactions_from_model(draft, source, "HEX") + hex_rxn = draft.reactions.get_by_id("HEX") + # Glucose reused from the draft (id glc_c), NOT the source's glucose_c + assert "glc_c" in {m.id for m in hex_rxn.metabolites} + assert "glucose_c" not in draft.metabolites + + +def test_new_metabolites_added_with_metadata(draft, source): + add_reactions_from_model(draft, source, "HEX") + assert "atp_c" in draft.metabolites and "g6p_c" in draft.metabolites + assert draft.metabolites.get_by_id("g6p_c").formula == "C6H13O9P" + assert draft.metabolites.get_by_id("atp_c").charge == -4 + + +def test_reaction_copied_with_bounds_and_name(draft, source): + (rxn,) = add_reactions_from_model(draft, source, "HEX") + assert rxn.id == "HEX" + assert rxn.name == "hexokinase" + assert rxn.bounds == (0, 1000) + assert rxn.subsystem == "glycolysis" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "glc_c": -1.0, + "atp_c": -1.0, + "g6p_c": 1.0, + } + + +def test_genes_true_copies_gpr_and_creates_genes(draft, source): + add_reactions_from_model(draft, source, "HEX", genes=True) + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G1" + assert "G1" in draft.genes + + +def test_genes_false_no_gpr(draft, source): + add_reactions_from_model(draft, source, "HEX", genes=False) + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "" + + +def test_genes_string_override(draft, source): + add_reactions_from_model(draft, source, "HEX", genes="G9 or G10") + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G9 or G10" + + +def test_skips_already_present(draft, source): + added = add_reactions_from_model(draft, source, ["HEX", "R_existing"]) + assert [r.id for r in added] == ["HEX"] + + +def test_all_present_raises(draft, source): + with pytest.raises(ValueError, match="already in the model"): + add_reactions_from_model(draft, source, "R_existing") + + +def test_unknown_source_reaction_raises(draft, source): + with pytest.raises(ValueError, match="not found in the source model"): + add_reactions_from_model(draft, source, "NOPE") + + +def test_note_and_confidence_stored(draft, source): + (rxn,) = add_reactions_from_model(draft, source, "HEX", note="from KEGG", confidence=2) + assert rxn.notes["note"] == "from KEGG" + assert rxn.notes["confidence_score"] == 2 + + +# --- regression: intra-batch met-id minting collision (known_issues.md A3) --- + +def test_intra_batch_id_minting_unique(): + """Two source mets whose ids both collide with the draft and whose name[comp] + differs both get routed through new-id minting. The fix tracks ids minted in + the current batch so the two don't collapse to the same generated id.""" + draft = cobra.Model("draft") + draft.add_metabolites([ + cobra.Metabolite("atp_c", name="ATP-draft", compartment="c"), + cobra.Metabolite("adp_c", name="ADP-draft", compartment="c"), + ]) + source = cobra.Model("source") + source.add_metabolites([ + cobra.Metabolite("atp_c", name="ATP-source", compartment="c"), + cobra.Metabolite("adp_c", name="ADP-source", compartment="c"), + ]) + rxn = cobra.Reaction("R1", lower_bound=0, upper_bound=1000) + source.add_reactions([rxn]) + rxn.add_metabolites({ + source.metabolites.get_by_id("atp_c"): -1, + source.metabolites.get_by_id("adp_c"): 1, + }) + add_reactions_from_model(draft, source, "R1") + # Both source mets minted distinct ids (m1 and m2) — not a collision. + new_ids = sorted(m.id for m in draft.metabolites if m.id not in ("atp_c", "adp_c")) + assert len(new_ids) == 2 and len(set(new_ids)) == 2 diff --git a/tests/test_manipulation_transport.py b/tests/test_manipulation_transport.py new file mode 100644 index 0000000..e8fb2b6 --- /dev/null +++ b/tests/test_manipulation_transport.py @@ -0,0 +1,98 @@ +"""Tests for add_transport_reactions (addTransport port).""" +import cobra +import pytest + +from raven_python.manipulation import add_transport_reactions + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.compartments = {"c": "cytoplasm", "m": "mitochondrion", "e": "extracellular"} + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"), + cobra.Metabolite("h2o_c", name="H2O", formula="H2O", compartment="c"), + cobra.Metabolite("atp_m", name="ATP", compartment="m"), # exists in m + ] + ) + return m + + +def test_basic_transport_to_existing(model): + added = add_transport_reactions(model, "c", "m", ["ATP"]) + assert len(added) == 1 + rxn = added[0] + assert rxn.id == "tr_0001" + assert rxn.name == "ATP transport, cytoplasm-mitochondrion" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "atp_c": -1.0, + "atp_m": 1.0, + } + assert rxn.reversibility is True + + +def test_only_to_existing_skips_missing(model): + # H2O is not in m; with only_to_existing (default) it's skipped + added = add_transport_reactions(model, "c", "m", ["ATP", "H2O"]) + assert [r.id for r in added] == ["tr_0001"] # only ATP + + +def test_creates_missing_target_metabolite(model): + added = add_transport_reactions( + model, "c", "m", ["H2O"], only_to_existing=False + ) + assert len(added) == 1 + new = [mt for mt in model.metabolites if mt.name == "H2O" and mt.compartment == "m"] + assert len(new) == 1 + assert new[0].formula == "H2O" # copied from source + + +def test_copies_formula_and_charge(model): + add_transport_reactions(model, "c", "e", ["ATP"], only_to_existing=False) + new = [mt for mt in model.metabolites if mt.name == "ATP" and mt.compartment == "e"][0] + assert new.formula == "C10H16N5O13P3" + assert new.charge == -4 + + +def test_irreversible(model): + (rxn,) = add_transport_reactions(model, "c", "m", ["ATP"], reversible=False) + assert rxn.lower_bound == 0 + assert rxn.reversibility is False + + +def test_default_all_metabolites_in_from(model): + # default metabolite_names = all in c (ATP, H2O); to m, only_to_existing -> only ATP + added = add_transport_reactions(model, "c", "m") + assert [r.id for r in added] == ["tr_0001"] + + +def test_multiple_target_compartments_and_sequential_ids(model): + added = add_transport_reactions( + model, "c", ["m", "e"], ["ATP"], only_to_existing=False + ) + assert [r.id for r in added] == ["tr_0001", "tr_0002"] + + +def test_unknown_compartment_raises(model): + with pytest.raises(ValueError, match="not in the model"): + add_transport_reactions(model, "x", "m", ["ATP"]) + + +def test_unknown_metabolite_raises(model): + with pytest.raises(ValueError, match="not found in compartment"): + add_transport_reactions(model, "c", "m", ["NOPE"]) + + +# --- regression: duplicate name in compartment (known_issues.md A4) -------- + +def test_duplicate_name_in_source_compartment_warns(model): + """Two source mets sharing a name in the same compartment warn instead of + silently collapsing — previously one was dropped from the lookup dict.""" + model.add_metabolites([ + cobra.Metabolite("h2o2_c", name="H2O", compartment="c"), # duplicate name + ]) + with pytest.warns(UserWarning, match="Multiple metabolites named 'H2O'"): + added = add_transport_reactions(model, "c", "m", ["H2O"], only_to_existing=False) + # Transport still works (uses the first match) — the warning is the signal. + assert len(added) == 1 diff --git a/tests/test_parameters.py b/tests/test_parameters.py new file mode 100644 index 0000000..c0ab06c --- /dev/null +++ b/tests/test_parameters.py @@ -0,0 +1,60 @@ +"""Tests for set_variance_bounds (the var mode of setParam).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, set_variance_bounds + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a_c <=> b_c"}, + {"id": "R2", "equation": "a_c <=> b_c"}, + ], + ) + return m + + +def test_band_positive(model): + set_variance_bounds(model, "R1", 100, 5) # 97.5 .. 102.5 + lb, ub = model.reactions.get_by_id("R1").bounds + assert lb == pytest.approx(97.5) + assert ub == pytest.approx(102.5) + + +def test_band_negative_is_ordered(model): + set_variance_bounds(model, "R1", -100, 5) + lb, ub = model.reactions.get_by_id("R1").bounds + assert lb == pytest.approx(-102.5) + assert ub == pytest.approx(-97.5) + assert lb <= ub + + +def test_broadcast_scalar(model): + set_variance_bounds(model, ["R1", "R2"], 50, 10) + for rid in ("R1", "R2"): + lb, ub = model.reactions.get_by_id(rid).bounds + assert lb == pytest.approx(47.5) + assert ub == pytest.approx(52.5) + + +def test_per_reaction_values(model): + set_variance_bounds(model, ["R1", "R2"], [100, 200], 0) + assert model.reactions.get_by_id("R1").bounds == pytest.approx((100, 100)) + assert model.reactions.get_by_id("R2").bounds == pytest.approx((200, 200)) + + +def test_length_mismatch_raises(model): + with pytest.raises(ValueError, match="to match the reactions"): + set_variance_bounds(model, ["R1", "R2"], [1, 2, 3], 5) + + +def test_unknown_reaction_raises(model): + with pytest.raises(ValueError, match="not found"): + set_variance_bounds(model, "NOPE", 1, 5) From a1dc557ece5d83d38466e96d676eeec66f90b1bb Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:50:05 +0200 Subject: [PATCH 3/3] Add binary + data resolvers for external tools and published artefacts --- docs/maintaining_binaries.md | 236 +++++++++++++++++++++++++++++++ scripts/make_registry_snippet.py | 102 +++++++++++++ src/raven_python/binaries.py | 148 +++++++++++++++++++ src/raven_python/data.py | 135 ++++++++++++++++++ tests/test_binaries.py | 80 +++++++++++ tests/test_data.py | 89 ++++++++++++ tests/test_scripts_registry.py | 58 ++++++++ 7 files changed, 848 insertions(+) create mode 100644 docs/maintaining_binaries.md create mode 100644 scripts/make_registry_snippet.py create mode 100644 src/raven_python/binaries.py create mode 100644 src/raven_python/data.py create mode 100644 tests/test_binaries.py create mode 100644 tests/test_data.py create mode 100644 tests/test_scripts_registry.py diff --git a/docs/maintaining_binaries.md b/docs/maintaining_binaries.md new file mode 100644 index 0000000..df5b315 --- /dev/null +++ b/docs/maintaining_binaries.md @@ -0,0 +1,236 @@ +# Maintaining bundled binaries (BLAST+, DIAMOND, …) + +Audience: **raven-python maintainers / the GitHub repo owner.** This explains how +raven-python ships external command-line tools, how to update their versions, and how +to build **minimal-footprint** ZIPs to attach to a GitHub release. + +> End users never read this. They get a binary automatically via `ensure_binary`, +> or use their own (system/conda) install. This doc is only for whoever publishes +> the release assets. + +--- + +## 1. How binary provisioning works + +raven-python does **not** vendor binaries in the git repo or on PyPI. Instead: + +1. For each tool we publish **version-pinned ZIPs as GitHub release assets**. +2. A **registry** (`src/raven_python/binaries_registry.json`) maps each *bundle* to its + version, the executables it provides, and per-platform `{asset, sha256}`. +3. At run time `raven_python.binaries.ensure_binary("blastp")` resolves a tool in this + order — and only reaches the download as a last resort: + + ``` + explicit binary= arg → env var (RAVEN_PYTHON_BLASTP / RAVEN_PYTHON_DIAMOND / …) + → shutil.which on PATH (system / conda / apt / brew) + → ensure_binary: download the pinned ZIP → verify SHA256 → cache → return path + → actionable error (with conda / manual instructions) + ``` + +So a pre-installed binary always wins; the bundle is the zero-setup fallback. +Pinning the version makes reconstruction **reproducible**. + +A *bundle* can provide several executables from one download (e.g. the `blast` +bundle provides both `blastp` and `makeblastdb`), so they are fetched once. + +--- + +## 2. What raven-python actually needs — ship only these + +Distribute the **minimum** set of executables. Everything else (other suite +tools, docs, examples, changelogs) must be excluded. + +| Bundle | Executables to include | Everything else | +|---|---|---| +| `diamond` | `diamond` | — (it is a single static binary) | +| `blast` | `blastp`, `makeblastdb` | **drop** `blastn`, `tblastn`, `psiblast`, `rpsblast`, `blast_formatter`, `*_vdb`, the `doc/`, `ChangeLog`, `README`, ~30 other tools | + +(Confirmed against RAVEN `getBlast`/`getDiamond`: only `makeblastdb`+`blastp`, and +`diamond` for its `makedb`/`blastp` subcommands, are ever invoked.) + +For BLAST+ this is the big win: the full NCBI suite is ~hundreds of MB; two +binaries (stripped) are a small fraction. + +--- + +## 3. Asset & ZIP conventions + +**Asset filename:** `---.zip` + +- `` ∈ `linux`, `macos`, `windows` +- `` ∈ `x86_64`, `arm64` +- examples: `diamond-2.1.11-linux-x86_64.zip`, `blast-2.16.0-macos-arm64.zip` + +**ZIP layout — flat, executables at the root, plus the upstream licence:** + +``` +diamond-2.1.11-linux-x86_64.zip +├── diamond +└── LICENSE + +blast-2.16.0-linux-x86_64.zip +├── blastp +├── makeblastdb +└── LICENSE +``` + +No nested `bin/`, no extra files. `ensure_binary` extracts the ZIP into the cache +and expects the executable at the top level. + +--- + +## 4. Step-by-step: add or update a version + +Example: bump DIAMOND to a new version for Linux x86-64. Repeat per `(os, arch)`. + +1. **Download the official upstream build** (never rebuild from source unless you + must): + - DIAMOND → + (`diamond-linux64.tar.gz`, `diamond-macos.tar.gz`) + - BLAST+ → or a + pinned version dir (`ncbi-blast-+-x64-linux.tar.gz`, + `-x64-macosx.tar.gz`, `-aarch64-linux.tar.gz`, `-x64-win64.tar.gz`). + - Record the upstream URL **and** its published checksum for provenance. +2. **Extract only the needed executables** (see §2) to a clean staging dir. +3. **Strip debug symbols** to shrink (skip on Windows / signed macOS builds): + ```bash + strip diamond # or: strip blastp makeblastdb + ``` +4. **Smoke-test the stripped binaries in a clean shell** (no other tools on PATH): + ```bash + ./diamond --version + ./blastp -version && ./makeblastdb -version + ``` + If they fail for a missing shared library, add that `.so`/`.dylib` to the ZIP + (rare — NCBI/DIAMOND release builds are largely self-contained). +5. **Add the upstream licence file** as `LICENSE` (see §6). +6. **Zip with max compression, flat layout:** + ```bash + zip -9 -j diamond-2.1.11-linux-x86_64.zip diamond LICENSE + # -j junks paths so entries sit at the ZIP root + ``` +7. **Compute the SHA256:** + ```bash + sha256sum diamond-2.1.11-linux-x86_64.zip # shasum -a 256 on macOS + ``` +8. **Attach the ZIP to a raven-python GitHub release** (a release tagged for the binary + set, e.g. `binaries-2024.06`, keeps them independent of code releases). +9. **Update the registry** `src/raven_python/binaries_registry.json` — bump `version` + and set the per-platform `asset` + `sha256`: + ```json + { + "diamond": { + "version": "2.1.11", + "provides": ["diamond"], + "platforms": { + "linux-x86_64": { + "asset": "diamond-2.1.11-linux-x86_64.zip", + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/binaries-2024.06/diamond-2.1.11-linux-x86_64.zip", + "sha256": "" + } + } + }, + "blast": { + "version": "2.16.0", + "provides": ["blastp", "makeblastdb"], + "platforms": { "linux-x86_64": { "asset": "...", "url": "...", "sha256": "..." } } + } + } + ``` +10. **Commit the registry change**, run the homology tests, and (if you have the + binary) confirm `ensure_binary("diamond", version="2.1.11")` downloads, + verifies, and runs. + +--- + +## 5. Keeping the footprint minimal — checklist + +- ✅ Only the executables in §2 (for BLAST+, exactly `blastp` + `makeblastdb`). +- ✅ `strip` the binaries (often halves their size). +- ✅ `zip -9 -j` (max compression, flat — no `bin/`, no folders). +- ✅ Exactly one extra file: `LICENSE`. +- ❌ No docs, examples, `ChangeLog`, `README`, man pages, test data, or sibling tools. +- ❌ No `.dSYM`/debug bundles; no duplicate static `.a` libraries. +- ➕ Only add a shared library if step-4 testing proves it is required. + +--- + +## 6. Platform / architecture matrix & licensing + +**Coverage = what you build.** Start with `linux-x86_64` (CI default), then add +`macos-arm64`, `macos-x86_64`, `linux-arm64`, `windows-x86_64` as capacity allows. +For any `(os, arch)` **not** in the registry, `ensure_binary` raises an actionable +error pointing to conda (`conda install -c bioconda diamond blast`) or a manual +install — that is the documented fallback, not a failure to fix urgently. + +**Licensing (must comply when redistributing):** + +- **BLAST+** — produced by NCBI (US Government); **public domain**, free to + redistribute. Include NCBI's `LICENSE` for courtesy/provenance. +- **DIAMOND** — **GPLv3**. Redistribution is allowed; you **must** include the + GPLv3 licence text in the ZIP and keep the binary unmodified (or offer source). +- **HMMER** (future) — BSD-3-Clause; include its `LICENSE`. + +Always ship the upstream licence in the ZIP, and keep a `BINARIES_PROVENANCE.md` +(or a note in the release body) recording, per asset: upstream URL, upstream +version, upstream checksum, and the SHA256 you published. + +### Native OS support per tool + +raven-python invokes each tool through `subprocess.run([resolved_path, …])` — that +call is itself cross-platform, so the real constraint is whether a given tool has +a binary that runs natively on each OS. It varies: + +| Tool | Linux | macOS (incl. arm64) | Windows (native) | +|---|---|---|---| +| BLAST+ (`blastp`, `makeblastdb`) | ✅ | ✅ | ✅ (NCBI ships Windows builds) | +| DIAMOND | ✅ | ✅ | ⚠️ native build exists but Linux-first | +| HMMER (`hmmbuild`/`hmmpress`/`hmmsearch`/`hmmscan`) | ✅ | ✅ | ❌ no official native build | +| MAFFT | ✅ | ✅ | ⚠️ Windows package is a wrapper | +| CD-HIT | ✅ | ✅ | ❌ no Windows build exists | + +Implications: + +- **Linux / macOS** — everything works. `conda install -c bioconda hmmer mafft + cd-hit blast diamond`, or point the `RAVEN_PYTHON_*` env vars at your installs. +- **Native Windows** — the homology track (BLAST+/DIAMOND) works, but the **KEGG + HMM build (3b.3) and HMM query (3b.5) do not**: HMMER and CD-HIT have no Windows + binaries, and bioconda has no Windows packages for any of them. Bundling can't + fix this — there is no binary to bundle. +- **Windows users should run raven-python inside WSL2** (or a Linux container), where + every tool is native Linux. raven-python does **not** replicate RAVEN's + `getWSLpath`/`wsl …` path translation: it calls the resolved binary directly, so + mixing native-Windows Python with WSL binaries is unsupported — keep the whole + stack inside WSL2. +- The common end-user paths — homology reconstruction and the KEGG *species* model + (3b.4) — need no HMMER/MAFFT/CD-HIT, so they are fully cross-platform. + +--- + +## 7. Emitting the registry entry + +After building the per-platform ZIPs (named `---.zip`) +and uploading them to the release, generate the `_REGISTRY` entry — checksums and +URLs — with [`scripts/make_registry_snippet.py`](../scripts/README.md): + +```bash +python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \ + --provides blastp makeblastdb --dir zips \ + --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0 +``` + +It prints the ready-to-paste `_REGISTRY["blast"]` block; its SHA256 helper is the +same one `ensure_binary` verifies with, so the checksums always match. (Producing +the minimal ZIPs themselves — download upstream, `strip`, `zip -9 -j`, add +`LICENSE` per §3–§6 — is still a manual/per-tool step.) + +--- + +## 8. Adding a new tool later (e.g. HMMER for KEGG reconstruction) + +1. Decide the **minimal executable set** (e.g. HMMER → `hmmsearch`, `hmmscan`, + maybe `hmmbuild`/`hmmpress`). +2. Add a bundle entry to the registry with `provides` listing those executables. +3. Build/attach ZIPs per §3–§4; include the tool's licence (§6). +4. The wrappers call `ensure_binary("hmmsearch", …)` with the same resolution + order — no new provisioning code needed. diff --git a/scripts/make_registry_snippet.py b/scripts/make_registry_snippet.py new file mode 100644 index 0000000..3efa49e --- /dev/null +++ b/scripts/make_registry_snippet.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +"""Emit ready-to-paste registry entries for published artefacts / binary ZIPs. + +Computes the SHA256 of each file and prints the Python/JSON entry to merge into +``raven_python.data._DATA_REGISTRY`` (data artefacts) or ``raven_python.binaries._REGISTRY`` +(binary bundles). Run once per release, after uploading the files to the release. + +Examples +-------- +Data artefacts (KEGG reference model + tables + HMM libraries) for one release:: + + python scripts/make_registry_snippet.py data \\ + --dataset kegg --version kegg116 --dir artefacts \\ + --base-url https://github.com/ORG/raven_python/releases/download/kegg-data-kegg116 + +Binary bundle (one ZIP per platform, named ``---.zip``):: + + python scripts/make_registry_snippet.py binary \\ + --bundle blast --version 2.16.0 --provides blastp makeblastdb --dir zips \\ + --base-url https://github.com/ORG/raven_python/releases/download/blast-2.16.0 + +The SHA256 helper is shared with the runtime resolvers (``raven_python.binaries``), so +published checksums always match what ``ensure_data`` / ``ensure_binary`` verify. +""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from raven_python.binaries import _sha256 + + +def _files_in(directory: Path) -> list[Path]: + """Regular, non-hidden files in ``directory``, sorted by name.""" + return sorted(p for p in directory.iterdir() if p.is_file() and not p.name.startswith(".")) + + +def data_entry(dataset: str, version: str, base_url: str, directory: Path) -> dict: + """Build the ``_DATA_REGISTRY[dataset]`` entry for every file in ``directory``.""" + base = base_url.rstrip("/") + files = { + p.name: {"url": f"{base}/{p.name}", "sha256": _sha256(p)} for p in _files_in(directory) + } + if not files: + raise SystemExit(f"No files found in {directory}") + return {"version": version, "files": files} + + +def binary_entry( + bundle: str, version: str, provides: list[str], base_url: str, directory: Path +) -> dict: + """Build the ``_REGISTRY[bundle]`` entry from ``---.zip``.""" + base = base_url.rstrip("/") + prefix = f"{bundle}-{version}-" + platforms = {} + for zip_path in directory.glob(f"{prefix}*.zip"): + platform = zip_path.name[len(prefix) : -len(".zip")] + platforms[platform] = {"url": f"{base}/{zip_path.name}", "sha256": _sha256(zip_path)} + if not platforms: + raise SystemExit(f"No {prefix}*.zip files found in {directory}") + return {"version": version, "provides": provides, "platforms": dict(sorted(platforms.items()))} + + +def render(key: str, entry: dict) -> str: + """Render ``{key: entry}`` as an indented JSON block (valid Python to paste).""" + return json.dumps({key: entry}, indent=4) + + +def main(argv: list[str] | None = None) -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + sub = parser.add_subparsers(dest="kind", required=True) + + d = sub.add_parser("data", help="data-artefact registry entry (raven_python.data)") + d.add_argument("--dataset", required=True, help="dataset key, e.g. 'kegg'") + d.add_argument("--version", required=True) + d.add_argument("--dir", required=True, type=Path, help="directory of uploaded artefacts") + d.add_argument("--base-url", required=True, help="release download URL prefix") + + b = sub.add_parser("binary", help="binary-bundle registry entry (raven_python.binaries)") + b.add_argument("--bundle", required=True, help="bundle key, e.g. 'blast'") + b.add_argument("--version", required=True) + b.add_argument("--provides", nargs="+", required=True, help="executables the bundle provides") + b.add_argument("--dir", required=True, type=Path, help="directory of uploaded ZIPs") + b.add_argument("--base-url", required=True, help="release download URL prefix") + + args = parser.parse_args(argv) + if args.kind == "data": + key, entry = args.dataset, data_entry(args.dataset, args.version, args.base_url, args.dir) + target = "raven_python/data.py _DATA_REGISTRY" + else: + key = args.bundle + entry = binary_entry(args.bundle, args.version, args.provides, args.base_url, args.dir) + target = "raven_python/binaries.py _REGISTRY" + + print(f"# Merge into {target}:", file=sys.stderr) + print(render(key, entry)) + + +if __name__ == "__main__": + main() diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py new file mode 100644 index 0000000..2d4b5a2 --- /dev/null +++ b/src/raven_python/binaries.py @@ -0,0 +1,148 @@ +"""Locate and provision external command-line binaries (BLAST+, DIAMOND, …). + +Shared across tools (not homology-specific). Resolution order for any executable: + + explicit path arg → env var (RAVEN_PYTHON_) → shutil.which (PATH) + → ensure_binary (download the version-pinned ZIP from a raven_python release, + verify SHA256, cache, return the path) + → FileNotFoundError with install guidance + +So a pre-installed/conda binary always wins; the bundled ZIP is the zero-setup +fallback. See docs/maintaining_binaries.md for how the release ZIPs and the +registry are produced and updated. +""" +from __future__ import annotations + +import hashlib +import os +import platform +import shutil +import zipfile +from pathlib import Path +from urllib.request import urlopen + +# Registry of bundled binaries. Empty until release ZIPs are published; populated +# per docs/maintaining_binaries.md. Keyed by *bundle*; one bundle can provide +# several executables (e.g. "blast" -> blastp + makeblastdb). +# bundle -> {version, provides:[exe...], platforms:{"-": {url, sha256}}} +_REGISTRY: dict = {} + +# Environment variable overrides per executable. +_ENV_VARS = { + "diamond": "RAVEN_PYTHON_DIAMOND", + "blastp": "RAVEN_PYTHON_BLASTP", + "makeblastdb": "RAVEN_PYTHON_MAKEBLASTDB", + "hmmbuild": "RAVEN_PYTHON_HMMBUILD", + "hmmpress": "RAVEN_PYTHON_HMMPRESS", + "hmmsearch": "RAVEN_PYTHON_HMMSEARCH", + "hmmscan": "RAVEN_PYTHON_HMMSCAN", + "mafft": "RAVEN_PYTHON_MAFFT", + "cd-hit": "RAVEN_PYTHON_CDHIT", +} + + +def platform_key() -> str: + """Return the ``-`` key used in the registry (e.g. ``linux-x86_64``).""" + system = {"linux": "linux", "darwin": "macos", "windows": "windows"}.get( + platform.system().lower(), platform.system().lower() + ) + machine = platform.machine().lower() + arch = {"x86_64": "x86_64", "amd64": "x86_64", "arm64": "arm64", "aarch64": "arm64"}.get( + machine, machine + ) + return f"{system}-{arch}" + + +def _cache_dir() -> Path: + base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache") + return Path(base) / "raven_python" / "binaries" + + +def _bundle_for(executable: str, registry: dict): + for name, bundle in registry.items(): + if executable in bundle.get("provides", []): + return name, bundle + return None, None + + +def _sha256(path: Path) -> str: + h = hashlib.sha256() + with open(path, "rb") as fh: + for chunk in iter(lambda: fh.read(1 << 20), b""): + h.update(chunk) + return h.hexdigest() + + +def ensure_binary(executable: str, *, registry: dict | None = None) -> Path: + """Download (if needed) and return the path to a bundled ``executable``. + + Consults the registry for the current platform, downloads the pinned ZIP, + verifies its SHA256, extracts it into the cache, and returns the executable + path. Raises ``FileNotFoundError`` if no bundle for this platform is hosted. + """ + registry = _REGISTRY if registry is None else registry + bundle_name, bundle = _bundle_for(executable, registry) + if bundle is None: + raise FileNotFoundError( + f"No bundled binary registered for {executable!r}. Install it (e.g. " + f"`conda install -c bioconda {executable}`) or pass an explicit path." + ) + key = platform_key() + entry = bundle.get("platforms", {}).get(key) + if entry is None: + raise FileNotFoundError( + f"No bundled {executable!r} for platform {key!r}. Install it " + f"(e.g. `conda install -c bioconda {executable}`), set " + f"{_ENV_VARS.get(executable, 'the binary path')}, or pass binary=." + ) + + dest_dir = _cache_dir() / f"{bundle_name}-{bundle['version']}-{key}" + exe = dest_dir / executable + if exe.exists(): + return exe + + dest_dir.mkdir(parents=True, exist_ok=True) + archive = dest_dir / "_download.zip" + # Download into a sibling .part file and rename on success — an interrupted + # download leaves the partial behind .part, never as a half-complete .zip + # that a later run might mistake for a finished one. Mirrors data.py. + part = archive.with_suffix(archive.suffix + ".part") + try: + with urlopen(entry["url"]) as resp, open(part, "wb") as out: # noqa: S310 + shutil.copyfileobj(resp, out) + digest = _sha256(part) + if digest != entry["sha256"]: + raise ValueError( + f"SHA256 mismatch for {executable!r} ({key}): " + f"expected {entry['sha256']}, got {digest}." + ) + os.replace(part, archive) + finally: + part.unlink(missing_ok=True) + with zipfile.ZipFile(archive) as zf: + zf.extractall(dest_dir) + archive.unlink(missing_ok=True) + if not exe.exists(): + raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.") + exe.chmod(0o755) + return exe + + +def resolve_binary(executable: str, *, binary: str | os.PathLike | None = None) -> str: + """Resolve an executable to a path: arg → env var → PATH → bundled ZIP → error.""" + if binary is not None: + return os.fspath(binary) + env_var = _ENV_VARS.get(executable) + if env_var and os.environ.get(env_var): + return os.environ[env_var] + found = shutil.which(executable) + if found: + return found + try: + return os.fspath(ensure_binary(executable)) + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Could not find {executable!r}. Install it (e.g. " + f"`conda install -c bioconda {executable}`), put it on PATH, set " + f"{env_var or 'the binary path'}, or pass binary=. ({exc})" + ) from exc diff --git a/src/raven_python/data.py b/src/raven_python/data.py new file mode 100644 index 0000000..b1264be --- /dev/null +++ b/src/raven_python/data.py @@ -0,0 +1,135 @@ +"""Fetch and cache published data artefacts (KEGG reference model, tables, HMMs). + +The mirror of :mod:`raven_python.binaries` for *data*: a version-pinned registry of +downloadable artefacts, fetched on first use, SHA256-verified, and cached under +platformdirs so end users never rebuild them from a KEGG dump (that is the +maintainer's job — see docs/maintaining_kegg_data.md). + +Resolution for any artefact file: + + explicit local dir → cached copy → download from the registry (verify, + cache) → FileNotFoundError with guidance + +The registry is **empty until the artefacts are published** (same as +``binaries._REGISTRY``); until then ``ensure_data_file`` raises an actionable +error. Cache layout:: + + $XDG_CACHE_HOME/raven_python/data/-/ + (or ~/.cache/raven_python/data/... if XDG_CACHE_HOME is unset) +""" +from __future__ import annotations + +import os +import shutil +from pathlib import Path +from urllib.request import urlopen + +from raven_python.binaries import _sha256 + +# dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}} +# Populated when raven_python publishes the KEGG artefacts as release assets. +_DATA_REGISTRY: dict = {} + +# The core KEGG artefacts needed to build a model (no HMM libraries). +CORE_KEGG_FILES = ( + "reference_model.yml.gz", + "ko_reaction.tsv.gz", + "ko_names.tsv.gz", + "organism_gene_ko.tsv.xz", + "rxn_flags.tsv.gz", +) + + +def _data_cache_dir() -> Path: + base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache") + return Path(base) / "raven_python" / "data" + + +def _bundle(dataset: str, registry: dict) -> dict: + bundle = registry.get(dataset) + if bundle is None: + raise FileNotFoundError( + f"No data artefacts registered for {dataset!r}. Either pass a local " + f"directory of artefacts, or build them per docs/maintaining_kegg_data.md." + ) + return bundle + + +def ensure_data_file( + dataset: str, + filename: str, + *, + version: str | None = None, + registry: dict | None = None, +) -> Path: + """Download (if needed) and return the cached path to one artefact file. + + Looks the file up in the registry for ``dataset`` (at ``version`` or the + registry's default), downloads it to the version-pinned cache directory, + verifies its SHA256, and returns the path. Re-uses an already-cached copy. + """ + registry = _DATA_REGISTRY if registry is None else registry + bundle = _bundle(dataset, registry) + ver = version or bundle["version"] + entry = bundle.get("files", {}).get(filename) + if entry is None: + raise FileNotFoundError( + f"{filename!r} is not registered for {dataset!r} {ver}. " + f"Available: {sorted(bundle.get('files', {}))}." + ) + + dest_dir = _data_cache_dir() / f"{dataset}-{ver}" + dest = dest_dir / filename + if dest.exists(): + return dest + + dest_dir.mkdir(parents=True, exist_ok=True) + tmp = dest.with_name(dest.name + ".part") + with urlopen(entry["url"]) as resp, open(tmp, "wb") as out: # noqa: S310 (trusted registry URLs) + shutil.copyfileobj(resp, out) + digest = _sha256(tmp) + if digest != entry["sha256"]: + tmp.unlink(missing_ok=True) + raise ValueError( + f"SHA256 mismatch for {dataset}/{filename} ({ver}): " + f"expected {entry['sha256']}, got {digest}." + ) + tmp.replace(dest) + return dest + + +def ensure_kegg_data( + *, + version: str | None = None, + files: tuple[str, ...] = CORE_KEGG_FILES, + registry: dict | None = None, +) -> Path: + """Ensure the core KEGG artefacts are cached; return their directory. + + Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg`` + dataset and returns the cache directory holding them — ready to pass as the + ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`. + """ + registry = _DATA_REGISTRY if registry is None else registry + ver = version or _bundle("kegg", registry)["version"] + for filename in files: + ensure_data_file("kegg", filename, version=ver, registry=registry) + return _data_cache_dir() / f"kegg-{ver}" + + +def ensure_kegg_hmm_library( + domain: str, *, version: str | None = None, registry: dict | None = None +) -> Path: + """Ensure a domain HMM library (and its hmmpress index) is cached; return its path. + + ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``.hmm`` + plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the + path to the ``.hmm`` (the argument for :func:`run_hmmscan`). + """ + registry = _DATA_REGISTRY if registry is None else registry + ver = version or _bundle("kegg", registry)["version"] + base = f"{domain}.hmm" + library = ensure_data_file("kegg", base, version=ver, registry=registry) + for suffix in (".h3f", ".h3i", ".h3m", ".h3p"): + ensure_data_file("kegg", base + suffix, version=ver, registry=registry) + return library diff --git a/tests/test_binaries.py b/tests/test_binaries.py new file mode 100644 index 0000000..d74ce0b --- /dev/null +++ b/tests/test_binaries.py @@ -0,0 +1,80 @@ +"""Tests for raven_python.binaries (binary resolution + bundled-ZIP provisioning).""" +import hashlib +import shutil +import zipfile +from pathlib import Path + +import pytest + +from raven_python import binaries + + +def test_resolve_explicit_path(): + assert binaries.resolve_binary("blastp", binary="/opt/x/blastp") == "/opt/x/blastp" + + +def test_resolve_env_var(monkeypatch): + monkeypatch.setenv("RAVEN_PYTHON_DIAMOND", "/custom/diamond") + assert binaries.resolve_binary("diamond") == "/custom/diamond" + + +@pytest.mark.skipif(not shutil.which("blastp"), reason="blastp not installed") +def test_resolve_via_path(): + assert binaries.resolve_binary("blastp") == shutil.which("blastp") + + +def test_resolve_unresolvable_raises(monkeypatch): + monkeypatch.setattr(shutil, "which", lambda _: None) + with pytest.raises(FileNotFoundError, match="Could not find"): + binaries.resolve_binary("diamond") # empty registry, not on PATH + + +def test_platform_key_format(): + key = binaries.platform_key() + assert "-" in key + os_part, arch = key.split("-", 1) + assert os_part in {"linux", "macos", "windows"} or os_part # tolerant + + +def test_ensure_binary_downloads_verifies_extracts(tmp_path, monkeypatch): + # Build a fake bundle ZIP containing an executable, served via file:// URL. + exe = tmp_path / "footool" + exe.write_text("#!/bin/sh\necho hi\n") + archive = tmp_path / "footool.zip" + with zipfile.ZipFile(archive, "w") as zf: + zf.write(exe, "footool") + sha = hashlib.sha256(archive.read_bytes()).hexdigest() + + registry = { + "footool": { + "version": "1.0", + "provides": ["footool"], + "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": sha}}, + } + } + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache")) + + path = binaries.ensure_binary("footool", registry=registry) + assert Path(path).exists() + assert Path(path).name == "footool" + # cached on second call (same path, no re-download needed) + assert binaries.ensure_binary("footool", registry=registry) == path + + +def test_ensure_binary_sha_mismatch(tmp_path, monkeypatch): + archive = tmp_path / "x.zip" + with zipfile.ZipFile(archive, "w") as zf: + zf.writestr("footool", "data") + registry = { + "footool": {"version": "1", "provides": ["footool"], + "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": "deadbeef"}}} + } + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache")) + with pytest.raises(ValueError, match="SHA256 mismatch"): + binaries.ensure_binary("footool", registry=registry) + + +def test_ensure_binary_unhosted_platform_raises(tmp_path): + registry = {"footool": {"version": "1", "provides": ["footool"], "platforms": {}}} + with pytest.raises(FileNotFoundError, match="No bundled"): + binaries.ensure_binary("footool", registry=registry) diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..714c3a9 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,89 @@ +"""Tests for ensure_data (data.py). Uses file:// URLs to avoid the network.""" +import hashlib + +import pytest + +from raven_python.data import ( + CORE_KEGG_FILES, + ensure_data_file, + ensure_kegg_data, +) + + +def _sha256(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +@pytest.fixture +def served(tmp_path, monkeypatch): + """A fake registry served from local files, with the cache pointed at tmp.""" + src = tmp_path / "src" + src.mkdir() + payloads = { + "reference_model.yml.gz": b"!!omap model bytes", + "ko_reaction.tsv.gz": b"ko\treaction\n", + "ko_names.tsv.gz": b"ko\tname\n", + "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n", + "rxn_flags.tsv.gz": b"reaction\tspontaneous\n", + } + files = {} + for name, data in payloads.items(): + path = src / name + path.write_bytes(data) + files[name] = {"url": path.as_uri(), "sha256": _sha256(data)} + registry = {"kegg": {"version": "v1", "files": files}} + + cache = tmp_path / "cache" + monkeypatch.setenv("XDG_CACHE_HOME", str(cache)) + return registry, cache, payloads + + +def test_ensure_data_file_downloads_and_caches(served): + registry, cache, payloads = served + path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry) + assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz" + assert path.read_bytes() == payloads["ko_reaction.tsv.gz"] + + +def test_ensure_data_file_reuses_cache(served, monkeypatch): + registry, _, _ = served + first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry) + # Break the URL: a second call must hit the cache, not re-download. + registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent" + second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry) + assert first == second and second.exists() + + +def test_sha256_mismatch_rejected(served): + registry, cache, _ = served + registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64 + with pytest.raises(ValueError, match="SHA256 mismatch"): + ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry) + # The corrupt partial download must not be left behind. + assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists() + + +def test_unknown_dataset_actionable_error(served): + registry, _, _ = served + with pytest.raises(FileNotFoundError, match="No data artefacts registered"): + ensure_data_file("metacyc", "x", registry=registry) + + +def test_unknown_file_lists_available(served): + registry, _, _ = served + with pytest.raises(FileNotFoundError, match="not registered"): + ensure_data_file("kegg", "missing.tsv.gz", registry=registry) + + +def test_ensure_kegg_data_fetches_core_set(served): + registry, cache, _ = served + out = ensure_kegg_data(registry=registry) + assert out == cache / "raven_python" / "data" / "kegg-v1" + for name in CORE_KEGG_FILES: + assert (out / name).is_file() + + +def test_empty_registry_raises(): + # The shipped registry is empty until artefacts are published. + with pytest.raises(FileNotFoundError, match="No data artefacts registered"): + ensure_data_file("kegg", "ko_reaction.tsv.gz") diff --git a/tests/test_scripts_registry.py b/tests/test_scripts_registry.py new file mode 100644 index 0000000..c9c03cf --- /dev/null +++ b/tests/test_scripts_registry.py @@ -0,0 +1,58 @@ +"""Tests for scripts/make_registry_snippet.py registry-entry helpers.""" +import hashlib +import importlib.util +import json +from pathlib import Path + +import pytest + +# scripts/ is not a package; load the module directly by path. +_SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "make_registry_snippet.py" +_spec = importlib.util.spec_from_file_location("make_registry_snippet", _SCRIPT) +mrs = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(mrs) + + +def _sha(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +def test_data_entry_lists_files_with_urls_and_checksums(tmp_path): + (tmp_path / "reference_model.yml.gz").write_bytes(b"model") + (tmp_path / "ko_reaction.tsv.gz").write_bytes(b"table") + (tmp_path / ".hidden").write_bytes(b"skip") # hidden files ignored + + entry = mrs.data_entry("kegg", "kegg116", "https://x/rel/", tmp_path) + assert entry["version"] == "kegg116" + assert set(entry["files"]) == {"reference_model.yml.gz", "ko_reaction.tsv.gz"} + ref = entry["files"]["reference_model.yml.gz"] + assert ref["url"] == "https://x/rel/reference_model.yml.gz" # trailing slash collapsed + assert ref["sha256"] == _sha(b"model") + + +def test_data_entry_empty_dir_errors(tmp_path): + with pytest.raises(SystemExit): + mrs.data_entry("kegg", "v1", "https://x", tmp_path) + + +def test_binary_entry_parses_platform_from_filename(tmp_path): + (tmp_path / "blast-2.16.0-linux-x86_64.zip").write_bytes(b"linux") + (tmp_path / "blast-2.16.0-macos-arm64.zip").write_bytes(b"mac") + (tmp_path / "other-1.0-linux-x86_64.zip").write_bytes(b"nope") # different bundle + + entry = mrs.binary_entry("blast", "2.16.0", ["blastp", "makeblastdb"], "https://x", tmp_path) + assert entry["provides"] == ["blastp", "makeblastdb"] + assert set(entry["platforms"]) == {"linux-x86_64", "macos-arm64"} + assert entry["platforms"]["macos-arm64"]["sha256"] == _sha(b"mac") + assert entry["platforms"]["linux-x86_64"]["url"].endswith("blast-2.16.0-linux-x86_64.zip") + + +def test_binary_entry_no_zips_errors(tmp_path): + with pytest.raises(SystemExit): + mrs.binary_entry("blast", "2.16.0", ["blastp"], "https://x", tmp_path) + + +def test_render_is_valid_json_round_trip(): + entry = {"version": "v1", "files": {"a": {"url": "u", "sha256": "s"}}} + text = mrs.render("kegg", entry) + assert json.loads(text) == {"kegg": entry}