SysBioChalmers
diff --git a/‎src/raven_python/utils/__init__.py‎
Lines changed: 16 additions & 0 deletions b/‎src/raven_python/utils/__init__.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎src/raven_python/utils/balance.py‎
Lines changed: 89 additions & 0 deletions b/‎src/raven_python/utils/balance.py‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎src/raven_python/utils/gpr.py‎
Lines changed: 119 additions & 0 deletions b/‎src/raven_python/utils/gpr.py‎
Lines changed: 119 additions & 0 deletions
diff --git a/‎src/raven_python/utils/parse.py‎
Lines changed: 33 additions & 0 deletions b/‎src/raven_python/utils/parse.py‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎src/raven_python/utils/sort.py‎
Lines changed: 21 additions & 0 deletions b/‎src/raven_python/utils/sort.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎src/raven_python/utils/validate.py‎
Lines changed: 86 additions & 0 deletions b/‎src/raven_python/utils/validate.py‎
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,16 @@
+"""Shared helpers — GPR linting, elemental balance, model curation checks, id sorting."""
+from raven_python.utils.balance import ElementalBalance, get_elemental_balance
+from raven_python.utils.gpr import GPRIssue, find_non_dnf_grrules, is_dnf
+from raven_python.utils.sort import sort_identifiers
+from raven_python.utils.validate import ModelIssue, check_model
+
+# Public API of ``raven_python.utils``: the names re-exported from the
+# submodule imports above, kept in ASCII sort order (capitalised class names
+# therefore come before the lowercase function names).
+# NOTE(review): nothing from ``raven_python.utils.parse`` is re-exported
+# here — confirm that is intentional.
+__all__ = [
+    "ElementalBalance",
+    "GPRIssue",
+    "ModelIssue",
+    "check_model",
+    "find_non_dnf_grrules",
+    "get_elemental_balance",
+    "is_dnf",
+    "sort_identifiers",
+]
@@ -0,0 +1,89 @@
+"""Check the elemental balance of reactions, distinguishing *unbalanced* from
+*unknown* (missing formula).
+
+cobra's ``reaction.check_mass_balance()`` silently treats a missing formula as
+empty, so a reaction can look "unbalanced" — or even balanced — when the truth is
+that the data is incomplete. This module checks for missing formulas first and
+returns a graded status per reaction (``balanced`` / ``unbalanced`` /
+``unknown``) plus the element imbalance, for a whole batch of reactions, as
+structured data.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ElementalBalance:
+    """Balance result for one reaction.
+
+    Attributes
+    ----------
+    reaction_id
+        ID of the reaction.
+    status
+        ``"balanced"`` — elements balance;
+        ``"unbalanced"`` — they do not (see ``imbalance``);
+        ``"unknown"`` — at least one metabolite has no formula, so it cannot be
+        determined (cobra would silently miscount these).
+    imbalance
+        Element → net coefficient (products − reactants), only for
+        ``"unbalanced"``; empty otherwise. Charge is not included.
+    """
+
+    reaction_id: str
+    status: str
+    imbalance: dict[str, float] = field(default_factory=dict)
+
+
+def get_elemental_balance(
+    model: cobra.Model, reactions=None
+) -> list[ElementalBalance]:
+    """Check whether reactions are elementally balanced.
+    Parameters
+    ----------
+    reactions
+        Reaction IDs/objects to check; default all reactions. (Boundary
+        reactions exchange mass with the environment and will read as
+        ``unbalanced`` — filter them out if that is not wanted.)
+
+    Returns
+    -------
+    list of ElementalBalance
+        One entry per checked reaction, in model order.
+    """
+    if reactions is None:
+        rxns = list(model.reactions)
+    else:
+        if isinstance(reactions, (str, cobra.Reaction)):
+            reactions = [reactions]
+        rxns = [
+            r if isinstance(r, cobra.Reaction) else model.reactions.get_by_id(r)
+            for r in reactions
+        ]
+
+    results: list[ElementalBalance] = []
+    for rxn in rxns:
+        if not rxn.metabolites:
+            # A reaction with no metabolites used to fall through to ``balanced``
+            # (vacuously) because ``any()`` over the empty list is False and the
+            # zero-element imbalance dict is empty. Treat the no-formula case
+            # (zero formulae present) as ``unknown``: we can't determine balance
+            # for a reaction without stoichiometry.
+            results.append(ElementalBalance(rxn.id, "unknown"))
+            continue
+        if any(not met.formula for met in rxn.metabolites):
+            results.append(ElementalBalance(rxn.id, "unknown"))
+            continue
+        imbalance = {
+            element: amount
+            for element, amount in rxn.check_mass_balance().items()
+            if element != "charge"
+        }
+        if imbalance:
+            results.append(ElementalBalance(rxn.id, "unbalanced", imbalance))
+        else:
+            results.append(ElementalBalance(rxn.id, "balanced"))
+    return results
@@ -0,0 +1,119 @@
+"""GPR (gene-protein-reaction rule) linting.
+
+Flag GPRs that are *not* in disjunctive normal form ("OR of AND-complexes"), via cobra's
+GPR AST. GPR syntax *normalisation* is already done by cobra on assignment, so it isn't
+re-implemented here.
+
+This DNF check has no cobrapy equivalent and is ported here, reworked onto cobra's
+GPR AST instead of RAVEN's brittle substring search. The relevant property is
+**disjunctive normal form (DNF)**: an OR of AND-clauses of single genes, e.g.
+``(G1 and G2) or G3``. Rules where an AND contains an OR — e.g.
+``(G1 or G2) and (G3 or G4)`` — are *valid* for cobra but ambiguous for the
+isoenzyme/complex reasoning used across RAVEN/GECKO, and ``expand_model``
+(see :mod:`raven_python.manipulation.expand`) only does something for DNF rules.
+:func:`find_non_dnf_grrules` surfaces them as structured data rather than, as
+RAVEN did, only printing a warning.
+"""
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _contains_or(node: ast.AST | None) -> bool:
+    """True if ``node``'s subtree contains an OR operator anywhere."""
+    if isinstance(node, ast.BoolOp):
+        if isinstance(node.op, ast.Or):
+            return True
+        return any(_contains_or(value) for value in node.values)
+    return False
+
+
+def _is_dnf_node(node: ast.AST | None) -> bool:
+    """True if the AST rooted at ``node`` is in disjunctive normal form.
+
+    DNF here means no AND operator has an OR anywhere beneath it, i.e. the
+    rule is a single gene, a pure AND-complex, or an OR of those.
+    """
+    if node is None or isinstance(node, ast.Name):
+        return True
+    if isinstance(node, ast.BoolOp):
+        if isinstance(node.op, ast.And):
+            return not any(_contains_or(value) for value in node.values)
+        # OR: every disjunct must itself be DNF
+        return all(_is_dnf_node(value) for value in node.values)
+    # Unknown node type: don't flag it as a problem.
+    return True
+
+
+def is_dnf(gpr: GPR | str | None) -> bool:
+    """Return whether a GPR is in disjunctive normal form (OR of AND-complexes).
+
+    Parameters
+    ----------
+    gpr
+        A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``.
+        An empty/``None`` rule is trivially DNF.
+
+    Examples
+    --------
+    >>> is_dnf("(G1 and G2) or G3")
+    True
+    >>> is_dnf("(G1 or G2) and G3")
+    False
+    """
+    if isinstance(gpr, str):
+        gpr = GPR.from_string(gpr)
+    if gpr is None:
+        return True
+    return _is_dnf_node(gpr.body)
+
+
+@dataclass(frozen=True)
+class GPRIssue:
+    """A reaction whose GPR is flagged by the linter.
+
+    Immutable record; ``find_non_dnf_grrules`` builds one per flagged
+    reaction.
+
+    Attributes
+    ----------
+    reaction_id
+        ID of the reaction.
+    gpr
+        The (already cobra-normalised) grRule string, as read from the
+        reaction's ``gene_reaction_rule``.
+    reason
+        Human-readable explanation of why it was flagged.
+    """
+
+    reaction_id: str
+    gpr: str
+    reason: str
+
+
+# Explanation attached to every ``GPRIssue`` that ``find_non_dnf_grrules``
+# emits; it states the problem and gives a worked example of rewriting a
+# non-DNF rule as an OR of AND-complexes.
+_NON_DNF_REASON = (
+    "GPR is not in disjunctive normal form (an AND clause contains an OR). "
+    "Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, "
+    'e.g. rewrite "(G1 or G2) and (G3 or G4)" as '
+    '"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".'
+)
+
+
+def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]:
+    """Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes").
+
+    Uses cobra's GPR AST. Reactions with no GPR are skipped.
+
+    Returns
+    -------
+    list of GPRIssue
+        One entry per flagged reaction, in model reaction order. Empty if all
+        GPRs are simple OR-of-AND-complexes.
+    """
+    issues: list[GPRIssue] = []
+    for rxn in model.reactions:
+        if not rxn.gene_reaction_rule:
+            continue
+        if not is_dnf(rxn.gpr):
+            issues.append(GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON))
+    return issues
@@ -0,0 +1,33 @@
+"""Small parsing helpers shared across raven_python."""
+from __future__ import annotations
+
+import re
+
+# A metabolite written as ``name[comp]``. The name is greedy so that, for a
+# pathological name that itself contains brackets, the *last* ``[...]`` is taken
+# as the compartment (matching RAVEN getIndexes' ``max(strfind('['))`` rule).
+# The name needs at least one character, and the compartment must be a
+# non-empty run of characters that are not square brackets, so ``"[c]"`` and
+# ``"ATP[]"`` do not match.
+_NAME_COMP_RE = re.compile(r"^(?P<name>.+)\[(?P<comp>[^\[\]]+)\]$")
+
+
+def parse_name_comp(token: str) -> tuple[str, str | None]:
+    """Split a ``name[comp]`` token into ``(name, compartment)``.
+
+    This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``'
+    ``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written
+    as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``.
+
+    Returns ``(name, None)`` when there is no trailing ``[...]``.
+
+    Examples
+    --------
+    >>> parse_name_comp("ATP[c]")
+    ('ATP', 'c')
+    >>> parse_name_comp("ATP")
+    ('ATP', None)
+    >>> parse_name_comp("weird[name][m]")
+    ('weird[name]', 'm')
+    """
+    match = _NAME_COMP_RE.match(token.strip())
+    if match:
+        return match.group("name").strip(), match.group("comp").strip()
+    return token.strip(), None
@@ -0,0 +1,21 @@
+"""Sort a model's identifiers alphabetically — useful for deterministic,
+diff-friendly output.
+
+cobra's ``DictList.sort`` reorders one list (and rebuilds its lookup index), but
+there is no single "sort the whole model" call; this provides it.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def sort_identifiers(model: cobra.Model) -> cobra.Model:
+    """Sort reactions, metabolites and genes alphabetically by ID, in place.
+
+    Returns the same (mutated) model for convenience. Compartments are a plain
+    dict and are emitted sorted by writers as needed.
+    """
+    model.reactions.sort(key=lambda r: r.id)
+    model.metabolites.sort(key=lambda m: m.id)
+    model.genes.sort(key=lambda g: g.id)
+    return model
@@ -0,0 +1,86 @@
+"""Curation checks for a model.
+
+A QC bundle cobra has no single call for: orphaned objects, empty reactions,
+duplicated metabolite ``name + compartment``, empty names, and objective sanity.
+:func:`check_model` returns these as structured :class:`ModelIssue` records.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ModelIssue:
+    """One curation issue found in a model.
+
+    Immutable record; ``check_model`` returns a list of these.
+
+    Attributes
+    ----------
+    category
+        Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``,
+        ``"orphan_gene"``, ``"duplicate_name_compartment"``,
+        ``"empty_metabolite_name"``, ``"objective"``.
+    object_id
+        ID of the offending object, or ``None`` for model-level issues
+        (``check_model`` uses ``None`` for the ``"duplicate_name_compartment"``
+        and ``"objective"`` categories).
+    message
+        Human-readable description.
+    """
+
+    category: str
+    object_id: str | None
+    message: str
+
+
+def check_model(model: cobra.Model) -> list[ModelIssue]:
+    """Run curation checks on a model and return the issues found.
+
+    Does not
+    raise; returns a (possibly empty) list of :class:`ModelIssue`.
+    """
+    issues: list[ModelIssue] = []
+
+    for met in model.metabolites:
+        if not met.reactions:
+            issues.append(
+                ModelIssue("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.")
+            )
+        if not (met.name and str(met.name).strip()):
+            issues.append(
+                ModelIssue("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.")
+            )
+
+    for gene in model.genes:
+        if not gene.reactions:
+            issues.append(
+                ModelIssue("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.")
+            )
+
+    for rxn in model.reactions:
+        if not rxn.metabolites:
+            issues.append(
+                ModelIssue("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.")
+            )
+
+    by_name_comp: dict[tuple[str, str], list[str]] = {}
+    for met in model.metabolites:
+        by_name_comp.setdefault((met.name, met.compartment), []).append(met.id)
+    for (name, comp), ids in by_name_comp.items():
+        if name and len(ids) > 1:
+            issues.append(
+                ModelIssue(
+                    "duplicate_name_compartment",
+                    None,
+                    f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}",
+                )
+            )
+
+    objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0]
+    if not objective_rxns:
+        issues.append(ModelIssue("objective", None, "No reaction has a nonzero objective coefficient."))
+    elif len(objective_rxns) > 1:
+        issues.append(
+            ModelIssue("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}")
+        )
+
+    return issues