Skip to content

Commit 50bea40

Browse files
committed
Add the foundation utilities: GPR, balance, parse, sort, validate
1 parent b7b69ac commit 50bea40

10 files changed

Lines changed: 646 additions & 0 deletions

File tree

src/raven_python/utils/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""Shared helpers — GPR linting, elemental balance, model curation checks, id sorting."""
2+
from raven_python.utils.balance import ElementalBalance, get_elemental_balance
3+
from raven_python.utils.gpr import GPRIssue, find_non_dnf_grrules, is_dnf
4+
from raven_python.utils.sort import sort_identifiers
5+
from raven_python.utils.validate import ModelIssue, check_model
6+
7+
__all__ = [
8+
"ElementalBalance",
9+
"GPRIssue",
10+
"ModelIssue",
11+
"check_model",
12+
"find_non_dnf_grrules",
13+
"get_elemental_balance",
14+
"is_dnf",
15+
"sort_identifiers",
16+
]

src/raven_python/utils/balance.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""Check the elemental balance of reactions, distinguishing *unbalanced* from
2+
*unknown* (missing formula).
3+
4+
cobra's ``reaction.check_mass_balance()`` silently treats a missing formula as
5+
empty, so a reaction can look "unbalanced" — or even balanced — when the truth is
6+
that the data is incomplete. This module checks for missing formulas first and
7+
returns a graded status
8+
per reaction (``balanced`` / ``unbalanced`` / ``unknown``) plus the element
9+
imbalance — over a batch, as structured data.
10+
"""
11+
from __future__ import annotations
12+
13+
from dataclasses import dataclass, field
14+
15+
import cobra
16+
17+
18+
@dataclass(frozen=True)
19+
class ElementalBalance:
20+
"""Balance result for one reaction.
21+
22+
Attributes
23+
----------
24+
reaction_id
25+
ID of the reaction.
26+
status
27+
``"balanced"`` — elements balance;
28+
``"unbalanced"`` — they do not (see ``imbalance``);
29+
``"unknown"`` — at least one metabolite has no formula, so it cannot be
30+
determined (cobra would silently miscount these).
31+
imbalance
32+
Element → net coefficient (products − reactants), only for
33+
``"unbalanced"``; empty otherwise. Charge is not included.
34+
"""
35+
36+
reaction_id: str
37+
status: str
38+
imbalance: dict[str, float] = field(default_factory=dict)
39+
40+
41+
def get_elemental_balance(
42+
model: cobra.Model, reactions=None
43+
) -> list[ElementalBalance]:
44+
"""Check whether reactions are elementally balanced.
45+
Parameters
46+
----------
47+
reactions
48+
Reaction IDs/objects to check; default all reactions. (Boundary
49+
reactions exchange mass with the environment and will read as
50+
``unbalanced`` — filter them out if that is not wanted.)
51+
52+
Returns
53+
-------
54+
list of ElementalBalance
55+
One entry per checked reaction, in model order.
56+
"""
57+
if reactions is None:
58+
rxns = list(model.reactions)
59+
else:
60+
if isinstance(reactions, (str, cobra.Reaction)):
61+
reactions = [reactions]
62+
rxns = [
63+
r if isinstance(r, cobra.Reaction) else model.reactions.get_by_id(r)
64+
for r in reactions
65+
]
66+
67+
results: list[ElementalBalance] = []
68+
for rxn in rxns:
69+
if not rxn.metabolites:
70+
# A reaction with no metabolites used to fall through to ``balanced``
71+
# (vacuously) because ``any()`` over the empty list is False and the
72+
# zero-element imbalance dict is empty. Treat the no-formula case
73+
# (zero formulae present) as ``unknown``: we can't determine balance
74+
# for a reaction without stoichiometry.
75+
results.append(ElementalBalance(rxn.id, "unknown"))
76+
continue
77+
if any(not met.formula for met in rxn.metabolites):
78+
results.append(ElementalBalance(rxn.id, "unknown"))
79+
continue
80+
imbalance = {
81+
element: amount
82+
for element, amount in rxn.check_mass_balance().items()
83+
if element != "charge"
84+
}
85+
if imbalance:
86+
results.append(ElementalBalance(rxn.id, "unbalanced", imbalance))
87+
else:
88+
results.append(ElementalBalance(rxn.id, "balanced"))
89+
return results

src/raven_python/utils/gpr.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
"""GPR (gene-protein-reaction rule) linting.
2+
3+
Flag GPRs that are *not* in disjunctive normal form ("OR of AND-complexes"), via cobra's
4+
GPR AST. GPR syntax *normalisation* is already done by cobra on assignment, so it isn't
5+
re-implemented here.
6+
7+
Part (2) has no cobrapy equivalent and is ported here, reworked onto cobra's
8+
GPR AST instead of RAVEN's brittle substring search. The relevant property is
9+
**disjunctive normal form (DNF)**: an OR of AND-clauses of single genes, e.g.
10+
``(G1 and G2) or G3``. Rules where an AND contains an OR — e.g.
11+
``(G1 or G2) and (G3 or G4)`` — are *valid* for cobra but ambiguous for the
12+
isoenzyme/complex reasoning used across RAVEN/GECKO, and ``expand_model``
13+
(see :mod:`raven_python.manipulation.expand`) only does something for DNF rules.
14+
:func:`find_non_dnf_grrules` surfaces them as structured data rather than, as
15+
RAVEN did, only printing a warning.
16+
"""
17+
from __future__ import annotations
18+
19+
import ast
20+
from dataclasses import dataclass
21+
22+
import cobra
23+
from cobra.core.gene import GPR
24+
25+
26+
def _contains_or(node: ast.AST | None) -> bool:
27+
"""True if ``node``'s subtree contains an OR operator anywhere."""
28+
if isinstance(node, ast.BoolOp):
29+
if isinstance(node.op, ast.Or):
30+
return True
31+
return any(_contains_or(value) for value in node.values)
32+
return False
33+
34+
35+
def _is_dnf_node(node: ast.AST | None) -> bool:
36+
"""True if the AST rooted at ``node`` is in disjunctive normal form.
37+
38+
DNF here means no AND operator has an OR anywhere beneath it, i.e. the
39+
rule is a single gene, a pure AND-complex, or an OR of those.
40+
"""
41+
if node is None or isinstance(node, ast.Name):
42+
return True
43+
if isinstance(node, ast.BoolOp):
44+
if isinstance(node.op, ast.And):
45+
return not any(_contains_or(value) for value in node.values)
46+
# OR: every disjunct must itself be DNF
47+
return all(_is_dnf_node(value) for value in node.values)
48+
# Unknown node type: don't flag it as a problem.
49+
return True
50+
51+
52+
def is_dnf(gpr: GPR | str | None) -> bool:
53+
"""Return whether a GPR is in disjunctive normal form (OR of AND-complexes).
54+
55+
Parameters
56+
----------
57+
gpr
58+
A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``.
59+
An empty/``None`` rule is trivially DNF.
60+
61+
Examples
62+
--------
63+
>>> is_dnf("(G1 and G2) or G3")
64+
True
65+
>>> is_dnf("(G1 or G2) and G3")
66+
False
67+
"""
68+
if isinstance(gpr, str):
69+
gpr = GPR.from_string(gpr)
70+
if gpr is None:
71+
return True
72+
return _is_dnf_node(gpr.body)
73+
74+
75+
@dataclass(frozen=True)
76+
class GPRIssue:
77+
"""A reaction whose GPR is flagged by the linter.
78+
79+
Attributes
80+
----------
81+
reaction_id
82+
ID of the reaction.
83+
gpr
84+
The (already cobra-normalised) grRule string.
85+
reason
86+
Human-readable explanation of why it was flagged.
87+
"""
88+
89+
reaction_id: str
90+
gpr: str
91+
reason: str
92+
93+
94+
_NON_DNF_REASON = (
95+
"GPR is not in disjunctive normal form (an AND clause contains an OR). "
96+
"Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, "
97+
'e.g. rewrite "(G1 or G2) and (G3 or G4)" as '
98+
'"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".'
99+
)
100+
101+
102+
def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]:
103+
"""Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes").
104+
105+
Uses cobra's GPR AST. Reactions with no GPR are skipped.
106+
107+
Returns
108+
-------
109+
list of GPRIssue
110+
One entry per flagged reaction, in model reaction order. Empty if all
111+
GPRs are simple OR-of-AND-complexes.
112+
"""
113+
issues: list[GPRIssue] = []
114+
for rxn in model.reactions:
115+
if not rxn.gene_reaction_rule:
116+
continue
117+
if not is_dnf(rxn.gpr):
118+
issues.append(GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON))
119+
return issues

src/raven_python/utils/parse.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""Small parsing helpers shared across raven_python."""
2+
from __future__ import annotations
3+
4+
import re
5+
6+
# A metabolite written as ``name[comp]``. The name is greedy so that, for a
7+
# pathological name that itself contains brackets, the *last* ``[...]`` is taken
8+
# as the compartment (matching RAVEN getIndexes' ``max(strfind('['))`` rule).
9+
_NAME_COMP_RE = re.compile(r"^(?P<name>.+)\[(?P<comp>[^\[\]]+)\]$")
10+
11+
12+
def parse_name_comp(token: str) -> tuple[str, str | None]:
13+
"""Split a ``name[comp]`` token into ``(name, compartment)``.
14+
15+
This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``'
16+
``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written
17+
as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``.
18+
19+
Returns ``(name, None)`` when there is no trailing ``[...]``.
20+
21+
Examples
22+
--------
23+
>>> parse_name_comp("ATP[c]")
24+
('ATP', 'c')
25+
>>> parse_name_comp("ATP")
26+
('ATP', None)
27+
>>> parse_name_comp("weird[name][m]")
28+
('weird[name]', 'm')
29+
"""
30+
match = _NAME_COMP_RE.match(token.strip())
31+
if match:
32+
return match.group("name").strip(), match.group("comp").strip()
33+
return token.strip(), None

src/raven_python/utils/sort.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""Sort a model's identifiers alphabetically — useful for deterministic,
2+
diff-friendly output.
3+
4+
cobra's ``DictList.sort`` reorders one list (and rebuilds its lookup index), but
5+
there is no single "sort the whole model" call; this provides it.
6+
"""
7+
from __future__ import annotations
8+
9+
import cobra
10+
11+
12+
def sort_identifiers(model: cobra.Model) -> cobra.Model:
13+
"""Sort reactions, metabolites and genes alphabetically by ID, in place.
14+
15+
Returns the same (mutated) model for convenience. Compartments are a plain
16+
dict and are emitted sorted by writers as needed.
17+
"""
18+
model.reactions.sort(key=lambda r: r.id)
19+
model.metabolites.sort(key=lambda m: m.id)
20+
model.genes.sort(key=lambda g: g.id)
21+
return model

src/raven_python/utils/validate.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""Curation checks for a model.
2+
3+
A QC bundle cobra has no single call for: orphaned objects, empty reactions,
4+
duplicated metabolite ``name + compartment``, empty names, and objective sanity.
5+
:func:`check_model` returns these as structured :class:`ModelIssue` records.
6+
"""
7+
from __future__ import annotations
8+
9+
from dataclasses import dataclass
10+
11+
import cobra
12+
13+
14+
@dataclass(frozen=True)
15+
class ModelIssue:
16+
"""One curation issue found in a model.
17+
18+
Attributes
19+
----------
20+
category
21+
Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``,
22+
``"orphan_gene"``, ``"duplicate_name_compartment"``,
23+
``"empty_metabolite_name"``, ``"objective"``.
24+
object_id
25+
ID of the offending object, or ``None`` for model-level issues.
26+
message
27+
Human-readable description.
28+
"""
29+
30+
category: str
31+
object_id: str | None
32+
message: str
33+
34+
35+
def check_model(model: cobra.Model) -> list[ModelIssue]:
36+
"""Run curation checks on a model and return the issues found.
37+
38+
Does not
39+
raise; returns a (possibly empty) list of :class:`ModelIssue`.
40+
"""
41+
issues: list[ModelIssue] = []
42+
43+
for met in model.metabolites:
44+
if not met.reactions:
45+
issues.append(
46+
ModelIssue("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.")
47+
)
48+
if not (met.name and str(met.name).strip()):
49+
issues.append(
50+
ModelIssue("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.")
51+
)
52+
53+
for gene in model.genes:
54+
if not gene.reactions:
55+
issues.append(
56+
ModelIssue("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.")
57+
)
58+
59+
for rxn in model.reactions:
60+
if not rxn.metabolites:
61+
issues.append(
62+
ModelIssue("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.")
63+
)
64+
65+
by_name_comp: dict[tuple[str, str], list[str]] = {}
66+
for met in model.metabolites:
67+
by_name_comp.setdefault((met.name, met.compartment), []).append(met.id)
68+
for (name, comp), ids in by_name_comp.items():
69+
if name and len(ids) > 1:
70+
issues.append(
71+
ModelIssue(
72+
"duplicate_name_compartment",
73+
None,
74+
f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}",
75+
)
76+
)
77+
78+
objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0]
79+
if not objective_rxns:
80+
issues.append(ModelIssue("objective", None, "No reaction has a nonzero objective coefficient."))
81+
elif len(objective_rxns) > 1:
82+
issues.append(
83+
ModelIssue("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}")
84+
)
85+
86+
return issues

0 commit comments

Comments
 (0)