|
| 1 | +"""Add reactions to a model from equation strings. |
| 2 | +
|
| 3 | +Most of the equivalent MATLAB code is struct-of-arrays bookkeeping (padding parallel |
| 4 | +``rxnNames`` / ``lb`` / ``ub`` / ``grRules`` / ... fields) that does not exist in |
| 5 | +cobra, where each ``Reaction`` carries its own attributes. cobra also already |
| 6 | +covers a large part of the *behaviour*: |
| 7 | +
|
| 8 | +* ``Reaction.build_reaction_from_string`` parses equation strings, coefficients, |
| 9 | + and reversibility arrows (``<=>``, ``-->``, ``=>``) and creates unknown |
| 10 | + metabolites — but only matching metabolites **by ID**, and it leaves new |
| 11 | + metabolites with ``compartment=None``. |
| 12 | +* assigning ``reaction.gene_reaction_rule`` auto-creates ``Gene`` objects. |
| 13 | +
|
| 14 | +So this port keeps only the parts cobra lacks: |
| 15 | +
|
| 16 | +* **name-based matching** — interpret equation tokens as metabolite *names* |
| 17 | + (RAVEN eqnType 2) or as ``name[comp]`` (eqnType 3), not just IDs; |
| 18 | +* **correct compartment** assignment for newly created metabolites; |
| 19 | +* **strict policies** — optionally *error* (rather than silently create) on |
| 20 | + unknown metabolites or genes, and always error on a duplicate reaction ID |
| 21 | + (cobra silently ignores those). |
| 22 | +
|
| 23 | +Instead of RAVEN's ``eqnType`` integer (1/2/3) the matching mode is a readable |
| 24 | +keyword: ``mets_by="id"`` or ``mets_by="name"``, with ``name[comp]`` recognised |
| 25 | +automatically. See IMPROVEMENTS.md (A-series) for the rationale. |
| 26 | +""" |
| 27 | +from __future__ import annotations |
| 28 | + |
| 29 | +import re |
| 30 | +import warnings |
| 31 | +from collections import OrderedDict |
| 32 | +from collections.abc import Mapping, Sequence |
| 33 | + |
| 34 | +import cobra |
| 35 | +from cobra import Metabolite, Reaction |
| 36 | +from cobra.core.gene import GPR |
| 37 | + |
| 38 | +from raven_python.utils.parse import parse_name_comp |
| 39 | + |
| 40 | +# Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it). |
| 41 | +_REVERSIBLE_ARROWS = ("<=>",) |
| 42 | +_FORWARD_ARROWS = ("-->", "->", "=>") |
| 43 | + |
| 44 | + |
| 45 | +def _split_equation(equation: str) -> tuple[str, str, bool]: |
| 46 | + """Split an equation into (lhs, rhs, reversible) on its arrow.""" |
| 47 | + for arrow in _REVERSIBLE_ARROWS: |
| 48 | + if arrow in equation: |
| 49 | + lhs, rhs = equation.split(arrow, 1) |
| 50 | + return lhs, rhs, True |
| 51 | + for arrow in _FORWARD_ARROWS: |
| 52 | + if arrow in equation: |
| 53 | + lhs, rhs = equation.split(arrow, 1) |
| 54 | + return lhs, rhs, False |
| 55 | + raise ValueError(f"No reaction arrow (<=>, -->, =>) found in equation: {equation!r}") |
| 56 | + |
| 57 | + |
| 58 | +def _parse_side(side: str) -> list[tuple[float, str, str | None]]: |
| 59 | + """Parse one side of an equation into ``[(coefficient, token, fallback), ...]``. |
| 60 | +
|
| 61 | + The ``fallback`` slot is for the ambiguous ``"<number> <rest>"`` shape: when |
| 62 | + matching by name, ``"2 oxoglutarate"`` could be either ``coeff=2, name="oxoglutarate"`` |
| 63 | + or ``coeff=1, name="2 oxoglutarate"`` (a real chemistry name). We return the |
| 64 | + coefficient-split form as the primary and the full term as the fallback; the |
| 65 | + resolver picks whichever matches an existing metabolite. Pure-number heads |
| 66 | + with no name (``"2"``) and pure-name terms (``"glucose"``) have no fallback. |
| 67 | + """ |
| 68 | + terms: list[tuple[float, str, str | None]] = [] |
| 69 | + for raw in side.split(" + "): |
| 70 | + term = raw.strip() |
| 71 | + if not term: |
| 72 | + continue |
| 73 | + head, _, tail = term.partition(" ") |
| 74 | + try: |
| 75 | + coeff = float(head) |
| 76 | + token = tail.strip() |
| 77 | + except ValueError: |
| 78 | + coeff, token = 1.0, term |
| 79 | + fallback = None |
| 80 | + else: |
| 81 | + # Coefficient-split succeeded. Keep the full term as a fallback when |
| 82 | + # the tail is non-empty so name-resolution can re-try it as one token. |
| 83 | + fallback = term if token else None |
| 84 | + if not token: |
| 85 | + raise ValueError(f"Missing metabolite after coefficient in term: {raw!r}") |
| 86 | + terms.append((coeff, token, fallback)) |
| 87 | + return terms |
| 88 | + |
| 89 | + |
| 90 | +def _new_met_id(model: cobra.Model, prefix: str) -> str: |
| 91 | + """Next free ``<prefix><int>`` metabolite ID (RAVEN m1, m2, ... scheme).""" |
| 92 | + pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$") |
| 93 | + used = [int(m.group(1)) for met in model.metabolites if (m := pattern.match(met.id))] |
| 94 | + n = max(used) + 1 if used else 1 |
| 95 | + while f"{prefix}{n}" in model.metabolites: |
| 96 | + n += 1 |
| 97 | + return f"{prefix}{n}" |
| 98 | + |
| 99 | + |
| 100 | +def _try_existing( |
| 101 | + model: cobra.Model, token: str, *, mets_by: str, compartment: str | None |
| 102 | +) -> Metabolite | None: |
| 103 | + """Look up ``token`` as an existing metabolite (no creation, no side effects). |
| 104 | +
|
| 105 | + Returns the matching metabolite or ``None``. Used by ``_stoichiometry`` to |
| 106 | + disambiguate the ``"<number> <rest>"`` shape: if a metabolite whose *name* |
| 107 | + (or id) literally contains a leading number exists, prefer it over splitting |
| 108 | + the number off as a coefficient. |
| 109 | + """ |
| 110 | + name, comp = parse_name_comp(token) |
| 111 | + if mets_by == "id" and comp is None: |
| 112 | + return model.metabolites.get_by_id(token) if token in model.metabolites else None |
| 113 | + target_comp = comp if comp is not None else compartment |
| 114 | + if target_comp is None: |
| 115 | + return None |
| 116 | + for met in model.metabolites: |
| 117 | + if met.name == name and met.compartment == target_comp: |
| 118 | + return met |
| 119 | + return None |
| 120 | + |
| 121 | + |
| 122 | +def _resolve_metabolite( |
| 123 | + model: cobra.Model, |
| 124 | + token: str, |
| 125 | + *, |
| 126 | + mets_by: str, |
| 127 | + compartment: str | None, |
| 128 | + allow_new_mets: bool, |
| 129 | + new_met_prefix: str, |
| 130 | +) -> Metabolite: |
| 131 | + """Resolve an equation token to an existing or newly created Metabolite.""" |
| 132 | + name, comp = parse_name_comp(token) |
| 133 | + |
| 134 | + if mets_by == "id" and comp is None: |
| 135 | + # token is a metabolite ID |
| 136 | + if token in model.metabolites: |
| 137 | + return model.metabolites.get_by_id(token) |
| 138 | + if not allow_new_mets: |
| 139 | + raise ValueError( |
| 140 | + f"Unknown metabolite ID {token!r}; pass allow_new_mets=True to create it." |
| 141 | + ) |
| 142 | + if compartment is None: |
| 143 | + raise ValueError( |
| 144 | + f"Cannot create metabolite {token!r}: no compartment given." |
| 145 | + ) |
| 146 | + _warn_unknown_compartment(model, compartment, token) |
| 147 | + met = Metabolite(token, compartment=compartment) |
| 148 | + model.add_metabolites([met]) |
| 149 | + return met |
| 150 | + |
| 151 | + # name-based (mets_by="name") or explicit name[comp] |
| 152 | + target_comp = comp if comp is not None else compartment |
| 153 | + if target_comp is None: |
| 154 | + raise ValueError( |
| 155 | + f"Metabolite {token!r} matched by name needs a compartment; " |
| 156 | + "pass compartment=... or use the name[comp] syntax." |
| 157 | + ) |
| 158 | + if comp is not None and target_comp not in model.compartments and not allow_new_mets: |
| 159 | + raise ValueError(f"Compartment {target_comp!r} is not in the model.") |
| 160 | + |
| 161 | + matches = [ |
| 162 | + met |
| 163 | + for met in model.metabolites |
| 164 | + if met.name == name and met.compartment == target_comp |
| 165 | + ] |
| 166 | + if matches: |
| 167 | + return matches[0] |
| 168 | + if not allow_new_mets: |
| 169 | + raise ValueError( |
| 170 | + f"No metabolite named {name!r} in compartment {target_comp!r}; " |
| 171 | + "pass allow_new_mets=True to create it." |
| 172 | + ) |
| 173 | + _warn_unknown_compartment(model, target_comp, name) |
| 174 | + met = Metabolite(_new_met_id(model, new_met_prefix), name=name, compartment=target_comp) |
| 175 | + model.add_metabolites([met]) |
| 176 | + return met |
| 177 | + |
| 178 | + |
| 179 | +def _warn_unknown_compartment(model: cobra.Model, compartment: str, identifier: str) -> None: |
| 180 | + """Warn when a new metabolite would be born into a not-yet-registered compartment. |
| 181 | +
|
| 182 | + Both ``mets_by`` paths previously created the metabolite without validating |
| 183 | + the compartment, so a typo (``"cyto"`` for ``"c"``) silently produced a |
| 184 | + one-metabolite ghost compartment. cobra inherits the compartment from the |
| 185 | + first metabolite assigned to it, so the fix is a warning, not a hard error. |
| 186 | + """ |
| 187 | + known = set(model.compartments) | set(model._compartments) |
| 188 | + if compartment not in known: |
| 189 | + warnings.warn( |
| 190 | + f"Creating metabolite {identifier!r} in unregistered compartment " |
| 191 | + f"{compartment!r} (existing: {sorted(known) or 'none'}); " |
| 192 | + "add the compartment first or check for a typo.", |
| 193 | + stacklevel=5, |
| 194 | + ) |
| 195 | + |
| 196 | + |
| 197 | +def _stoichiometry( |
| 198 | + model: cobra.Model, |
| 199 | + equation: str, |
| 200 | + *, |
| 201 | + mets_by: str, |
| 202 | + compartment: str | None, |
| 203 | + allow_new_mets: bool, |
| 204 | + new_met_prefix: str, |
| 205 | +) -> tuple[dict[Metabolite, float], bool]: |
| 206 | + """Parse an equation into a {Metabolite: net coefficient} dict + reversibility.""" |
| 207 | + lhs, rhs, reversible = _split_equation(equation) |
| 208 | + coeffs: OrderedDict[Metabolite, float] = OrderedDict() |
| 209 | + had_terms = False |
| 210 | + for sign, side in ((-1.0, lhs), (1.0, rhs)): |
| 211 | + for coeff, token, fallback in _parse_side(side): |
| 212 | + had_terms = True |
| 213 | + # "<number> <name>" is ambiguous when the name itself starts with a |
| 214 | + # number (e.g. "2 oxoglutarate"). Prefer the full-term interpretation |
| 215 | + # when it matches an existing metabolite — otherwise fall through to |
| 216 | + # the coefficient-split form. |
| 217 | + met = None |
| 218 | + if fallback is not None: |
| 219 | + met = _try_existing( |
| 220 | + model, fallback, mets_by=mets_by, compartment=compartment |
| 221 | + ) |
| 222 | + if met is not None: |
| 223 | + coeff = 1.0 |
| 224 | + if met is None: |
| 225 | + met = _resolve_metabolite( |
| 226 | + model, |
| 227 | + token, |
| 228 | + mets_by=mets_by, |
| 229 | + compartment=compartment, |
| 230 | + allow_new_mets=allow_new_mets, |
| 231 | + new_met_prefix=new_met_prefix, |
| 232 | + ) |
| 233 | + coeffs[met] = coeffs.get(met, 0.0) + sign * coeff |
| 234 | + # Drop metabolites that net to zero (present as both substrate and product). |
| 235 | + coeffs = OrderedDict((met, c) for met, c in coeffs.items() if c != 0.0) |
| 236 | + if had_terms and not coeffs: |
| 237 | + warnings.warn( |
| 238 | + f"Equation {equation!r} has no net metabolites (all terms cancelled); " |
| 239 | + "the reaction will be added with empty stoichiometry.", |
| 240 | + stacklevel=4, |
| 241 | + ) |
| 242 | + return dict(coeffs), reversible |
| 243 | + |
| 244 | + |
| 245 | +def add_reactions_from_equations( |
| 246 | + model: cobra.Model, |
| 247 | + reactions: Sequence[Mapping], |
| 248 | + *, |
| 249 | + mets_by: str = "id", |
| 250 | + compartment: str | None = None, |
| 251 | + allow_new_mets: bool = True, |
| 252 | + allow_new_genes: bool = True, |
| 253 | + new_met_prefix: str = "m", |
| 254 | +) -> list[Reaction]: |
| 255 | + """Add reactions defined by equation strings, matching mets by ID or name. |
| 256 | + Parameters |
| 257 | + ---------- |
| 258 | + model |
| 259 | + Target ``cobra.Model``, mutated in place. |
| 260 | + reactions |
| 261 | + Sequence of mappings, one per reaction. Recognised keys: |
| 262 | +
|
| 263 | + * ``id`` (**required**) — reaction ID; must not already exist. |
| 264 | + * ``equation`` (**required**) — e.g. ``"atp_c + h2o_c <=> adp_c + pi_c"``. |
| 265 | + Use ``<=>`` for reversible, ``-->``/``->``/``=>`` for irreversible. |
| 266 | + * ``name`` — reaction name. |
| 267 | + * ``bounds`` — ``(lower, upper)`` tuple; overrides the arrow. |
| 268 | + * ``gene_reaction_rule`` — GPR string. |
| 269 | + * ``subsystem`` — subsystem name. |
| 270 | + mets_by |
| 271 | + How bare equation tokens (without ``[comp]``) are matched: |
| 272 | + ``"id"`` (RAVEN eqnType 1) or ``"name"`` (eqnType 2). A ``name[comp]`` |
| 273 | + token (eqnType 3) is always matched by name + compartment. |
| 274 | + compartment |
| 275 | + Default compartment for new metabolites and for name-matched tokens |
| 276 | + without an explicit ``[comp]``. |
| 277 | + allow_new_mets |
| 278 | + If True (default), create metabolites not found. New metabolites get |
| 279 | + ``compartment`` (id mode) or an auto ID ``m1``, ``m2``, ... (name mode). |
| 280 | + If False, an unknown metabolite raises. |
| 281 | + allow_new_genes |
| 282 | + If True (default), genes in a GPR are auto-created by cobra. If False, |
| 283 | + a GPR referencing a gene not already in the model raises. |
| 284 | + new_met_prefix |
| 285 | + Prefix for auto-generated metabolite IDs in name mode (default ``"m"``). |
| 286 | +
|
| 287 | + Returns |
| 288 | + ------- |
| 289 | + list of cobra.Reaction |
| 290 | + The reactions added, in input order. |
| 291 | + """ |
| 292 | + if mets_by not in ("id", "name"): |
| 293 | + raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}") |
| 294 | + |
| 295 | + known_genes = {gene.id for gene in model.genes} |
| 296 | + added: list[Reaction] = [] |
| 297 | + |
| 298 | + for spec in reactions: |
| 299 | + if "id" not in spec: |
| 300 | + raise ValueError(f"Reaction spec missing required 'id': {spec!r}") |
| 301 | + rxn_id = spec["id"] |
| 302 | + if rxn_id in model.reactions: |
| 303 | + raise ValueError( |
| 304 | + f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first." |
| 305 | + ) |
| 306 | + if "equation" not in spec: |
| 307 | + raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.") |
| 308 | + |
| 309 | + coeffs, reversible = _stoichiometry( |
| 310 | + model, |
| 311 | + spec["equation"], |
| 312 | + mets_by=mets_by, |
| 313 | + compartment=compartment, |
| 314 | + allow_new_mets=allow_new_mets, |
| 315 | + new_met_prefix=new_met_prefix, |
| 316 | + ) |
| 317 | + |
| 318 | + rxn = Reaction(rxn_id, name=spec.get("name", "")) |
| 319 | + if "bounds" in spec: |
| 320 | + rxn.bounds = tuple(spec["bounds"]) |
| 321 | + else: |
| 322 | + config = cobra.Configuration() |
| 323 | + lower = config.lower_bound if reversible else 0.0 |
| 324 | + rxn.bounds = (lower, config.upper_bound) |
| 325 | + if "subsystem" in spec: |
| 326 | + rxn.subsystem = spec["subsystem"] |
| 327 | + |
| 328 | + model.add_reactions([rxn]) |
| 329 | + rxn.add_metabolites(coeffs) |
| 330 | + |
| 331 | + rule = spec.get("gene_reaction_rule", "") |
| 332 | + if rule: |
| 333 | + if not allow_new_genes: |
| 334 | + missing = sorted(set(GPR.from_string(rule).genes) - known_genes) |
| 335 | + if missing: |
| 336 | + raise ValueError( |
| 337 | + f"Reaction {rxn_id!r} references genes not in the model: " |
| 338 | + f"{missing}. Set allow_new_genes=True or add them first." |
| 339 | + ) |
| 340 | + rxn.gene_reaction_rule = rule |
| 341 | + known_genes.update(gene.id for gene in rxn.genes) |
| 342 | + |
| 343 | + added.append(rxn) |
| 344 | + |
| 345 | + return added |
0 commit comments