From 50bea4006f174a3549e670fd3ea622f8e31083cf Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:46:42 +0200
Subject: [PATCH 1/3] Add the foundation utilities: GPR, balance, parse, sort,
 validate

---
 src/raven_python/utils/__init__.py |  16 ++++
 src/raven_python/utils/balance.py  |  89 +++++++++++++++++++++
 src/raven_python/utils/gpr.py      | 119 +++++++++++++++++++++++++++++
 src/raven_python/utils/parse.py    |  33 ++++++++
 src/raven_python/utils/sort.py     |  21 +++++
 src/raven_python/utils/validate.py |  86 +++++++++++++++++++++
 tests/test_utils_balance.py        |  76 ++++++++++++++++++
 tests/test_utils_gpr.py            |  84 ++++++++++++++++++++
 tests/test_utils_sort.py           |  42 ++++++++++
 tests/test_utils_validate.py       |  80 +++++++++++++++++++
 10 files changed, 646 insertions(+)
 create mode 100644 src/raven_python/utils/__init__.py
 create mode 100644 src/raven_python/utils/balance.py
 create mode 100644 src/raven_python/utils/gpr.py
 create mode 100644 src/raven_python/utils/parse.py
 create mode 100644 src/raven_python/utils/sort.py
 create mode 100644 src/raven_python/utils/validate.py
 create mode 100644 tests/test_utils_balance.py
 create mode 100644 tests/test_utils_gpr.py
 create mode 100644 tests/test_utils_sort.py
 create mode 100644 tests/test_utils_validate.py

diff --git a/src/raven_python/utils/__init__.py b/src/raven_python/utils/__init__.py
new file mode 100644
index 0000000..7127bdd
--- /dev/null
+++ b/src/raven_python/utils/__init__.py
@@ -0,0 +1,16 @@
+"""Shared helpers — GPR linting, elemental balance, model curation checks, id sorting."""
+from raven_python.utils.balance import ElementalBalance, get_elemental_balance
+from raven_python.utils.gpr import GPRIssue, find_non_dnf_grrules, is_dnf
+from raven_python.utils.sort import sort_identifiers
+from raven_python.utils.validate import ModelIssue, check_model
+
+__all__ = [
+    "ElementalBalance",
+    "GPRIssue",
+    "ModelIssue",
+    "check_model",
+    "find_non_dnf_grrules",
+    "get_elemental_balance",
+    "is_dnf",
+    "sort_identifiers",
+]
diff --git a/src/raven_python/utils/balance.py b/src/raven_python/utils/balance.py
new file mode 100644
index 0000000..ee64ab4
--- /dev/null
+++ b/src/raven_python/utils/balance.py
@@ -0,0 +1,89 @@
+"""Check the elemental balance of reactions, distinguishing *unbalanced* from
+*unknown* (missing formula).
+
+cobra's ``reaction.check_mass_balance()`` silently treats a missing formula as
+empty, so a reaction can look "unbalanced" — or even balanced — when the truth is
+that the data is incomplete. This module checks for missing formulas first and
+returns a graded status
+per reaction (``balanced`` / ``unbalanced`` / ``unknown``) plus the element
+imbalance — over a batch, as structured data.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ElementalBalance:
+    """Balance result for one reaction.
+
+    Attributes
+    ----------
+    reaction_id
+        ID of the reaction.
+    status
+        ``"balanced"`` — elements balance;
+        ``"unbalanced"`` — they do not (see ``imbalance``);
+        ``"unknown"`` — at least one metabolite has no formula, so it cannot be
+        determined (cobra would silently miscount these).
+    imbalance
+        Element → net coefficient (products − reactants), only for
+        ``"unbalanced"``; empty otherwise. Charge is not included.
+    """
+
+    reaction_id: str
+    status: str
+    imbalance: dict[str, float] = field(default_factory=dict)
+
+
+def get_elemental_balance(
+    model: cobra.Model, reactions=None
+) -> list[ElementalBalance]:
+    """Check whether reactions are elementally balanced.
+    Parameters
+    ----------
+    reactions
+        Reaction IDs/objects to check; default all reactions. (Boundary
+        reactions exchange mass with the environment and will read as
+        ``unbalanced`` — filter them out if that is not wanted.)
+
+    Returns
+    -------
+    list of ElementalBalance
+        One entry per checked reaction, in model order.
+    """
+    if reactions is None:
+        rxns = list(model.reactions)
+    else:
+        if isinstance(reactions, (str, cobra.Reaction)):
+            reactions = [reactions]
+        rxns = [
+            r if isinstance(r, cobra.Reaction) else model.reactions.get_by_id(r)
+            for r in reactions
+        ]
+
+    results: list[ElementalBalance] = []
+    for rxn in rxns:
+        if not rxn.metabolites:
+            # A reaction with no metabolites used to fall through to ``balanced``
+            # (vacuously) because ``any()`` over the empty list is False and the
+            # zero-element imbalance dict is empty. Treat the no-formula case
+            # (zero formulae present) as ``unknown``: we can't determine balance
+            # for a reaction without stoichiometry.
+            results.append(ElementalBalance(rxn.id, "unknown"))
+            continue
+        if any(not met.formula for met in rxn.metabolites):
+            results.append(ElementalBalance(rxn.id, "unknown"))
+            continue
+        imbalance = {
+            element: amount
+            for element, amount in rxn.check_mass_balance().items()
+            if element != "charge"
+        }
+        if imbalance:
+            results.append(ElementalBalance(rxn.id, "unbalanced", imbalance))
+        else:
+            results.append(ElementalBalance(rxn.id, "balanced"))
+    return results
diff --git a/src/raven_python/utils/gpr.py b/src/raven_python/utils/gpr.py
new file mode 100644
index 0000000..2e2122d
--- /dev/null
+++ b/src/raven_python/utils/gpr.py
@@ -0,0 +1,119 @@
+"""GPR (gene-protein-reaction rule) linting.
+
+Flag GPRs that are *not* in disjunctive normal form ("OR of AND-complexes"), via cobra's
+GPR AST. GPR syntax *normalisation* is already done by cobra on assignment, so it isn't
+re-implemented here.
+
+Part (2) has no cobrapy equivalent and is ported here, reworked onto cobra's
+GPR AST instead of RAVEN's brittle substring search. The relevant property is
+**disjunctive normal form (DNF)**: an OR of AND-clauses of single genes, e.g.
+``(G1 and G2) or G3``. Rules where an AND contains an OR — e.g.
+``(G1 or G2) and (G3 or G4)`` — are *valid* for cobra but ambiguous for the
+isoenzyme/complex reasoning used across RAVEN/GECKO, and ``expand_model``
+(see :mod:`raven_python.manipulation.expand`) only does something for DNF rules.
+:func:`find_non_dnf_grrules` surfaces them as structured data rather than, as
+RAVEN did, only printing a warning.
+"""
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _contains_or(node: ast.AST | None) -> bool:
+    """True if ``node``'s subtree contains an OR operator anywhere."""
+    if isinstance(node, ast.BoolOp):
+        if isinstance(node.op, ast.Or):
+            return True
+        return any(_contains_or(value) for value in node.values)
+    return False
+
+
+def _is_dnf_node(node: ast.AST | None) -> bool:
+    """True if the AST rooted at ``node`` is in disjunctive normal form.
+
+    DNF here means no AND operator has an OR anywhere beneath it, i.e. the
+    rule is a single gene, a pure AND-complex, or an OR of those.
+    """
+    if node is None or isinstance(node, ast.Name):
+        return True
+    if isinstance(node, ast.BoolOp):
+        if isinstance(node.op, ast.And):
+            return not any(_contains_or(value) for value in node.values)
+        # OR: every disjunct must itself be DNF
+        return all(_is_dnf_node(value) for value in node.values)
+    # Unknown node type: don't flag it as a problem.
+    return True
+
+
+def is_dnf(gpr: GPR | str | None) -> bool:
+    """Return whether a GPR is in disjunctive normal form (OR of AND-complexes).
+
+    Parameters
+    ----------
+    gpr
+        A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``.
+        An empty/``None`` rule is trivially DNF.
+
+    Examples
+    --------
+    >>> is_dnf("(G1 and G2) or G3")
+    True
+    >>> is_dnf("(G1 or G2) and G3")
+    False
+    """
+    if isinstance(gpr, str):
+        gpr = GPR.from_string(gpr)
+    if gpr is None:
+        return True
+    return _is_dnf_node(gpr.body)
+
+
+@dataclass(frozen=True)
+class GPRIssue:
+    """A reaction whose GPR is flagged by the linter.
+
+    Attributes
+    ----------
+    reaction_id
+        ID of the reaction.
+    gpr
+        The (already cobra-normalised) grRule string.
+    reason
+        Human-readable explanation of why it was flagged.
+    """
+
+    reaction_id: str
+    gpr: str
+    reason: str
+
+
+_NON_DNF_REASON = (
+    "GPR is not in disjunctive normal form (an AND clause contains an OR). "
+    "Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, "
+    'e.g. rewrite "(G1 or G2) and (G3 or G4)" as '
+    '"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".'
+)
+
+
+def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]:
+    """Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes").
+
+    Uses cobra's GPR AST. Reactions with no GPR are skipped.
+
+    Returns
+    -------
+    list of GPRIssue
+        One entry per flagged reaction, in model reaction order. Empty if all
+        GPRs are simple OR-of-AND-complexes.
+    """
+    issues: list[GPRIssue] = []
+    for rxn in model.reactions:
+        if not rxn.gene_reaction_rule:
+            continue
+        if not is_dnf(rxn.gpr):
+            issues.append(GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON))
+    return issues
diff --git a/src/raven_python/utils/parse.py b/src/raven_python/utils/parse.py
new file mode 100644
index 0000000..8068f6c
--- /dev/null
+++ b/src/raven_python/utils/parse.py
@@ -0,0 +1,33 @@
+"""Small parsing helpers shared across raven_python."""
+from __future__ import annotations
+
+import re
+
+# A metabolite written as ``name[comp]``. The name is greedy so that, for a
+# pathological name that itself contains brackets, the *last* ``[...]`` is taken
+# as the compartment (matching RAVEN getIndexes' ``max(strfind('['))`` rule).
+_NAME_COMP_RE = re.compile(r"^(?P<name>.+)\[(?P<comp>[^\[\]]+)\]$")
+
+
+def parse_name_comp(token: str) -> tuple[str, str | None]:
+    """Split a ``name[comp]`` token into ``(name, compartment)``.
+
+    This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``'
+    ``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written
+    as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``.
+
+    Returns ``(name, None)`` when there is no trailing ``[...]``.
+
+    Examples
+    --------
+    >>> parse_name_comp("ATP[c]")
+    ('ATP', 'c')
+    >>> parse_name_comp("ATP")
+    ('ATP', None)
+    >>> parse_name_comp("weird[name][m]")
+    ('weird[name]', 'm')
+    """
+    match = _NAME_COMP_RE.match(token.strip())
+    if match:
+        return match.group("name").strip(), match.group("comp").strip()
+    return token.strip(), None
diff --git a/src/raven_python/utils/sort.py b/src/raven_python/utils/sort.py
new file mode 100644
index 0000000..a8641a8
--- /dev/null
+++ b/src/raven_python/utils/sort.py
@@ -0,0 +1,21 @@
+"""Sort a model's identifiers alphabetically — useful for deterministic,
+diff-friendly output.
+
+cobra's ``DictList.sort`` reorders one list (and rebuilds its lookup index), but
+there is no single "sort the whole model" call; this provides it.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def sort_identifiers(model: cobra.Model) -> cobra.Model:
+    """Sort reactions, metabolites and genes alphabetically by ID, in place.
+
+    Returns the same (mutated) model for convenience. Compartments are a plain
+    dict and are emitted sorted by writers as needed.
+    """
+    model.reactions.sort(key=lambda r: r.id)
+    model.metabolites.sort(key=lambda m: m.id)
+    model.genes.sort(key=lambda g: g.id)
+    return model
diff --git a/src/raven_python/utils/validate.py b/src/raven_python/utils/validate.py
new file mode 100644
index 0000000..c08df48
--- /dev/null
+++ b/src/raven_python/utils/validate.py
@@ -0,0 +1,86 @@
+"""Curation checks for a model.
+
+A QC bundle cobra has no single call for: orphaned objects, empty reactions,
+duplicated metabolite ``name + compartment``, empty names, and objective sanity.
+:func:`check_model` returns these as structured :class:`ModelIssue` records.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ModelIssue:
+    """One curation issue found in a model.
+
+    Attributes
+    ----------
+    category
+        Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``,
+        ``"orphan_gene"``, ``"duplicate_name_compartment"``,
+        ``"empty_metabolite_name"``, ``"objective"``.
+    object_id
+        ID of the offending object, or ``None`` for model-level issues.
+    message
+        Human-readable description.
+    """
+
+    category: str
+    object_id: str | None
+    message: str
+
+
+def check_model(model: cobra.Model) -> list[ModelIssue]:
+    """Run curation checks on a model and return the issues found.
+
+    Does not
+    raise; returns a (possibly empty) list of :class:`ModelIssue`.
+    """
+    issues: list[ModelIssue] = []
+
+    for met in model.metabolites:
+        if not met.reactions:
+            issues.append(
+                ModelIssue("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.")
+            )
+        if not (met.name and str(met.name).strip()):
+            issues.append(
+                ModelIssue("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.")
+            )
+
+    for gene in model.genes:
+        if not gene.reactions:
+            issues.append(
+                ModelIssue("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.")
+            )
+
+    for rxn in model.reactions:
+        if not rxn.metabolites:
+            issues.append(
+                ModelIssue("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.")
+            )
+
+    by_name_comp: dict[tuple[str, str], list[str]] = {}
+    for met in model.metabolites:
+        by_name_comp.setdefault((met.name, met.compartment), []).append(met.id)
+    for (name, comp), ids in by_name_comp.items():
+        if name and len(ids) > 1:
+            issues.append(
+                ModelIssue(
+                    "duplicate_name_compartment",
+                    None,
+                    f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}",
+                )
+            )
+
+    objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0]
+    if not objective_rxns:
+        issues.append(ModelIssue("objective", None, "No reaction has a nonzero objective coefficient."))
+    elif len(objective_rxns) > 1:
+        issues.append(
+            ModelIssue("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}")
+        )
+
+    return issues
diff --git a/tests/test_utils_balance.py b/tests/test_utils_balance.py
new file mode 100644
index 0000000..aa1e47e
--- /dev/null
+++ b/tests/test_utils_balance.py
@@ -0,0 +1,76 @@
+"""Tests for get_elemental_balance (getElementalBalance port)."""
+import cobra
+import pytest
+
+from raven_python.utils import ElementalBalance, get_elemental_balance
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("a_c", formula="C6H12O6", charge=0, compartment="c"),
+            cobra.Metabolite("b_c", formula="C6H12O6", charge=0, compartment="c"),
+            cobra.Metabolite("c_c", formula="C3H6O3", charge=0, compartment="c"),
+            cobra.Metabolite("n_c", compartment="c"),  # no formula
+        ]
+    )
+    r_bal = cobra.Reaction("R_bal")
+    m.add_reactions([r_bal])
+    r_bal.build_reaction_from_string("a_c --> b_c")        # C6H12O6 -> C6H12O6
+    r_unbal = cobra.Reaction("R_unbal")
+    m.add_reactions([r_unbal])
+    r_unbal.build_reaction_from_string("a_c --> c_c")      # C6H12O6 -> C3H6O3
+    r_unknown = cobra.Reaction("R_unknown")
+    m.add_reactions([r_unknown])
+    r_unknown.build_reaction_from_string("a_c --> n_c")    # n_c has no formula
+    return m
+
+
+def test_balanced(model):
+    (res,) = get_elemental_balance(model, ["R_bal"])
+    assert res == ElementalBalance("R_bal", "balanced", {})
+
+
+def test_unbalanced_reports_imbalance(model):
+    (res,) = get_elemental_balance(model, ["R_unbal"])
+    assert res.status == "unbalanced"
+    # products - reactants: C3H6O3 - C6H12O6 = -C3H6O3
+    assert res.imbalance == {"C": -3.0, "H": -6.0, "O": -3.0}
+
+
+def test_missing_formula_is_unknown_not_silently_wrong(model):
+    # cobra's check_mass_balance alone would silently report an imbalance here;
+    # we flag it as unknown instead.
+    (res,) = get_elemental_balance(model, ["R_unknown"])
+    assert res.status == "unknown"
+    assert res.imbalance == {}
+
+
+def test_all_reactions_default(model):
+    results = get_elemental_balance(model)
+    assert {r.reaction_id: r.status for r in results} == {
+        "R_bal": "balanced",
+        "R_unbal": "unbalanced",
+        "R_unknown": "unknown",
+    }
+
+
+def test_charge_excluded(model):
+    # give a charge imbalance but keep elements balanced -> still "balanced"
+    model.metabolites.get_by_id("b_c").charge = 1
+    (res,) = get_elemental_balance(model, ["R_bal"])
+    assert res.status == "balanced"
+
+
+# --- regression: empty reaction → unknown (known_issues.md F5) -------------
+
+def test_empty_reaction_is_unknown(model):
+    """A reaction with no metabolites used to be reported `balanced`
+    vacuously (any() over an empty list is False and check_mass_balance
+    returns no imbalance). Now reports `unknown`."""
+    empty = cobra.Reaction("R_empty", lower_bound=0, upper_bound=1000)
+    model.add_reactions([empty])
+    (res,) = get_elemental_balance(model, ["R_empty"])
+    assert res.status == "unknown"
diff --git a/tests/test_utils_gpr.py b/tests/test_utils_gpr.py
new file mode 100644
index 0000000..275d020
--- /dev/null
+++ b/tests/test_utils_gpr.py
@@ -0,0 +1,84 @@
+"""Tests for raven_python.utils.gpr (GPR linting)."""
+import cobra
+import pytest
+
+from raven_python.utils import GPRIssue, find_non_dnf_grrules, is_dnf
+
+
+@pytest.mark.parametrize(
+    "rule",
+    [
+        "",
+        "G1",
+        "G1 and G2",
+        "G1 or G2",
+        "G1 and G2 and G3",
+        "G1 or G2 or G3",
+        "(G1 and G2) or G3",
+        "(G1 and G2) or (G3 and G4)",
+        "G1 or (G2 and G3)",
+    ],
+)
+def test_is_dnf_true(rule):
+    assert is_dnf(rule) is True
+
+
+@pytest.mark.parametrize(
+    "rule",
+    [
+        "(G1 or G2) and G3",
+        "G1 and (G2 or G3)",
+        "(G1 or G2) and (G3 or G4)",
+        "G1 and (G2 or (G3 and G4))",
+    ],
+)
+def test_is_dnf_false(rule):
+    assert is_dnf(rule) is False
+
+
+def test_is_dnf_accepts_gpr_and_none():
+    from cobra.core.gene import GPR
+
+    assert is_dnf(GPR.from_string("(G1 or G2) and G3")) is False
+    assert is_dnf(GPR.from_string("G1 or G2")) is True
+    assert is_dnf(None) is True
+
+
+def test_is_dnf_independent_of_formatting():
+    # cobra normalises on assignment, so casing/whitespace cannot change the verdict.
+    assert is_dnf("(G1 OR G2)   AND   G3") is False
+    assert is_dnf("( G1 and G2 )  or  G3") is True
+
+
+def _model_with_rules(rules: dict[str, str]) -> cobra.Model:
+    model = cobra.Model("t")
+    model.add_reactions([cobra.Reaction(rid) for rid in rules])
+    for rid, rule in rules.items():
+        model.reactions.get_by_id(rid).gene_reaction_rule = rule
+    return model
+
+
+def test_find_non_dnf_grrules_flags_only_offenders():
+    model = _model_with_rules(
+        {
+            "R_ok_single": "G1",
+            "R_ok_complex": "G1 and G2",
+            "R_ok_dnf": "(G1 and G2) or G3",
+            "R_no_gpr": "",
+            "R_bad_1": "(G1 or G2) and G3",
+            "R_bad_2": "(G1 or G2) and (G3 or G4)",
+        }
+    )
+
+    issues = find_non_dnf_grrules(model)
+
+    assert [i.reaction_id for i in issues] == ["R_bad_1", "R_bad_2"]
+    assert all(isinstance(i, GPRIssue) for i in issues)
+    assert all("disjunctive normal form" in i.reason for i in issues)
+    # the reported GPR is the cobra-normalised string
+    assert issues[0].gpr == "(G1 or G2) and G3"
+
+
+def test_find_non_dnf_grrules_empty_when_all_clean():
+    model = _model_with_rules({"R1": "G1 or G2", "R2": "(G1 and G2) or G3"})
+    assert find_non_dnf_grrules(model) == []
diff --git a/tests/test_utils_sort.py b/tests/test_utils_sort.py
new file mode 100644
index 0000000..18bca24
--- /dev/null
+++ b/tests/test_utils_sort.py
@@ -0,0 +1,42 @@
+"""Tests for sort_identifiers and write_yaml_model(sort_ids=True)."""
+import cobra
+
+from raven_python.io import read_yaml_model, write_yaml_model
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils import sort_identifiers
+
+
+def _model():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("b_c", "a_c")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R2", "equation": "a_c --> b_c", "gene_reaction_rule": "GB"},
+            {"id": "R1", "equation": "b_c --> a_c", "gene_reaction_rule": "GA"},
+        ],
+    )
+    return m
+
+
+def test_sort_identifiers_orders_everything():
+    m = _model()
+    sort_identifiers(m)
+    assert [r.id for r in m.reactions] == ["R1", "R2"]
+    assert [x.id for x in m.metabolites] == ["a_c", "b_c"]
+    assert [g.id for g in m.genes] == ["GA", "GB"]
+    # lookup index still intact after sorting
+    assert m.reactions.get_by_id("R2").id == "R2"
+
+
+def test_write_yaml_sort_ids_does_not_mutate(tmp_path):
+    m = _model()
+    order_before = [r.id for r in m.reactions]
+    out = tmp_path / "m.yml"
+    write_yaml_model(m, out, sort_ids=True)
+    assert [r.id for r in m.reactions] == order_before  # model untouched
+    # but the file is sorted
+    text = out.read_text()
+    assert text.index("R1") < text.index("R2")
+    reloaded = read_yaml_model(out)
+    assert [r.id for r in reloaded.reactions] == ["R1", "R2"]
diff --git a/tests/test_utils_validate.py b/tests/test_utils_validate.py
new file mode 100644
index 0000000..2d38e6f
--- /dev/null
+++ b/tests/test_utils_validate.py
@@ -0,0 +1,80 @@
+"""Tests for check_model (the surviving checks of checkModelStruct)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils import ModelIssue, check_model
+
+
+def _categories(issues, category):
+    return [i.object_id for i in issues if i.category == category]
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("a_c", name="A", compartment="c"),
+            cobra.Metabolite("b_c", name="B", compartment="c"),
+        ]
+    )
+    add_reactions_from_equations(
+        m, [{"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}]
+    )
+    m.reactions.get_by_id("R1").objective_coefficient = 1
+    return m
+
+
+def test_clean_model_has_no_issues(model):
+    assert check_model(model) == []
+
+
+def test_orphan_metabolite(model):
+    model.add_metabolites([cobra.Metabolite("orphan_c", name="Orphan", compartment="c")])
+    assert "orphan_c" in _categories(check_model(model), "orphan_metabolite")
+
+
+def test_orphan_gene(model):
+    model.genes.append(cobra.core.gene.Gene("G_lonely"))
+    assert "G_lonely" in _categories(check_model(model), "orphan_gene")
+
+
+def test_empty_reaction(model):
+    model.add_reactions([cobra.Reaction("R_empty")])
+    assert "R_empty" in _categories(check_model(model), "empty_reaction")
+
+
+def test_empty_metabolite_name(model):
+    model.add_metabolites([cobra.Metabolite("noname_c", compartment="c")])
+    # also an orphan, but we check the name category specifically
+    assert "noname_c" in _categories(check_model(model), "empty_metabolite_name")
+
+
+def test_duplicate_name_compartment(model):
+    # second metabolite named "A" in compartment c
+    dup = cobra.Metabolite("a2_c", name="A", compartment="c")
+    model.add_metabolites([dup])
+    model.reactions.get_by_id("R1").add_metabolites({dup: -1})  # keep it used
+    issues = [i for i in check_model(model) if i.category == "duplicate_name_compartment"]
+    assert len(issues) == 1
+    assert "a_c" in issues[0].message and "a2_c" in issues[0].message
+
+
+def test_no_objective(model):
+    model.reactions.get_by_id("R1").objective_coefficient = 0
+    cats = [i.category for i in check_model(model)]
+    assert "objective" in cats
+
+
+def test_multiple_objectives(model):
+    add_reactions_from_equations(model, [{"id": "R2", "equation": "b_c --> a_c"}])
+    model.reactions.get_by_id("R2").objective_coefficient = 1
+    obj_issues = [i for i in check_model(model) if i.category == "objective"]
+    assert len(obj_issues) == 1
+    assert "Multiple" in obj_issues[0].message
+
+
+def test_returns_model_issue_instances(model):
+    model.add_reactions([cobra.Reaction("R_empty")])
+    assert all(isinstance(i, ModelIssue) for i in check_model(model))

From 4bc0d6ee29070a0bad4e347a78947c7fae8199a0 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:46:59 +0200
Subject: [PATCH 2/3] Add the model-manipulation layer (add, remove, transport,
 merge, etc.)

---
 src/raven_python/manipulation/__init__.py     |  36 ++
 src/raven_python/manipulation/add.py          | 345 ++++++++++++++++++
 src/raven_python/manipulation/change.py       | 125 +++++++
 src/raven_python/manipulation/compartments.py | 196 ++++++++++
 src/raven_python/manipulation/expand.py       | 124 +++++++
 src/raven_python/manipulation/irreversible.py |  72 ++++
 src/raven_python/manipulation/merge.py        | 146 ++++++++
 src/raven_python/manipulation/parameters.py   |  78 ++++
 src/raven_python/manipulation/remove.py       | 120 ++++++
 src/raven_python/manipulation/simplify.py     | 229 ++++++++++++
 src/raven_python/manipulation/transfer.py     | 144 ++++++++
 src/raven_python/manipulation/transport.py    | 157 ++++++++
 tests/test_change_grrules.py                  |  49 +++
 tests/test_manipulation_add.py                | 278 ++++++++++++++
 tests/test_manipulation_change.py             |  93 +++++
 tests/test_manipulation_compartments.py       | 139 +++++++
 tests/test_manipulation_expand.py             | 288 +++++++++++++++
 tests/test_manipulation_irreversible.py       | 144 ++++++++
 tests/test_manipulation_merge.py              | 136 +++++++
 tests/test_manipulation_remove.py             |  97 +++++
 tests/test_manipulation_simplify.py           | 184 ++++++++++
 tests/test_manipulation_transfer.py           | 137 +++++++
 tests/test_manipulation_transport.py          |  98 +++++
 tests/test_parameters.py                      |  60 +++
 24 files changed, 3475 insertions(+)
 create mode 100644 src/raven_python/manipulation/__init__.py
 create mode 100644 src/raven_python/manipulation/add.py
 create mode 100644 src/raven_python/manipulation/change.py
 create mode 100644 src/raven_python/manipulation/compartments.py
 create mode 100644 src/raven_python/manipulation/expand.py
 create mode 100644 src/raven_python/manipulation/irreversible.py
 create mode 100644 src/raven_python/manipulation/merge.py
 create mode 100644 src/raven_python/manipulation/parameters.py
 create mode 100644 src/raven_python/manipulation/remove.py
 create mode 100644 src/raven_python/manipulation/simplify.py
 create mode 100644 src/raven_python/manipulation/transfer.py
 create mode 100644 src/raven_python/manipulation/transport.py
 create mode 100644 tests/test_change_grrules.py
 create mode 100644 tests/test_manipulation_add.py
 create mode 100644 tests/test_manipulation_change.py
 create mode 100644 tests/test_manipulation_compartments.py
 create mode 100644 tests/test_manipulation_expand.py
 create mode 100644 tests/test_manipulation_irreversible.py
 create mode 100644 tests/test_manipulation_merge.py
 create mode 100644 tests/test_manipulation_remove.py
 create mode 100644 tests/test_manipulation_simplify.py
 create mode 100644 tests/test_manipulation_transfer.py
 create mode 100644 tests/test_manipulation_transport.py
 create mode 100644 tests/test_parameters.py

diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py
new file mode 100644
index 0000000..074c36f
--- /dev/null
+++ b/src/raven_python/manipulation/__init__.py
@@ -0,0 +1,36 @@
+"""Generic cobra.Model structural transforms that cobrapy does not cover cleanly:
+reaction building from equations, batch GPR / bound changes, irreversibility splitting,
+isozyme expansion, compartment merge / copy, and model merging by name."""
+from .add import add_reactions_from_equations
+from .change import change_gene_reaction_rules, change_reaction_equations
+from .expand import expand_model
+from .irreversible import convert_to_irreversible
+from .merge import merge_models
+from .parameters import set_variance_bounds
+from .remove import remove_genes, remove_metabolites
+from .simplify import (
+    constrain_reversible_reactions,
+    group_linear_reactions,
+    remove_dead_end_reactions,
+    remove_duplicate_reactions,
+)
+from .transfer import add_reactions_from_model
+from .transport import add_transport_reactions
+
+__all__ = [
+    "add_reactions_from_equations",
+    "add_reactions_from_model",
+    "add_transport_reactions",
+    "change_gene_reaction_rules",
+    "change_reaction_equations",
+    "constrain_reversible_reactions",
+    "convert_to_irreversible",
+    "expand_model",
+    "group_linear_reactions",
+    "merge_models",
+    "remove_dead_end_reactions",
+    "remove_duplicate_reactions",
+    "remove_genes",
+    "remove_metabolites",
+    "set_variance_bounds",
+]
diff --git a/src/raven_python/manipulation/add.py b/src/raven_python/manipulation/add.py
new file mode 100644
index 0000000..3842297
--- /dev/null
+++ b/src/raven_python/manipulation/add.py
@@ -0,0 +1,345 @@
+"""Add reactions to a model from equation strings.
+
+Most of the equivalent MATLAB code is struct-of-arrays bookkeeping (padding parallel
+``rxnNames`` / ``lb`` / ``ub`` / ``grRules`` / ... fields) that does not exist in
+cobra, where each ``Reaction`` carries its own attributes. cobra also already
+covers a large part of the *behaviour*:
+
+* ``Reaction.build_reaction_from_string`` parses equation strings, coefficients,
+  and reversibility arrows (``<=>``, ``-->``, ``=>``) and creates unknown
+  metabolites — but only matching metabolites **by ID**, and it leaves new
+  metabolites with ``compartment=None``.
+* assigning ``reaction.gene_reaction_rule`` auto-creates ``Gene`` objects.
+
+So this port keeps only the parts cobra lacks:
+
+* **name-based matching** — interpret equation tokens as metabolite *names*
+  (RAVEN eqnType 2) or as ``name[comp]`` (eqnType 3), not just IDs;
+* **correct compartment** assignment for newly created metabolites;
+* **strict policies** — optionally *error* (rather than silently create) on
+  unknown metabolites or genes, and always error on a duplicate reaction ID
+  (cobra silently ignores those).
+
+Instead of RAVEN's ``eqnType`` integer (1/2/3) the matching mode is a readable
+keyword: ``mets_by="id"`` or ``mets_by="name"``, with ``name[comp]`` recognised
+automatically. See IMPROVEMENTS.md (A-series) for the rationale.
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from collections import OrderedDict
+from collections.abc import Mapping, Sequence
+
+import cobra
+from cobra import Metabolite, Reaction
+from cobra.core.gene import GPR
+
+from raven_python.utils.parse import parse_name_comp
+
+# Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it).
+_REVERSIBLE_ARROWS = ("<=>",)
+_FORWARD_ARROWS = ("-->", "->", "=>")
+
+
+def _split_equation(equation: str) -> tuple[str, str, bool]:
+    """Split an equation into (lhs, rhs, reversible) on its arrow."""
+    for arrow in _REVERSIBLE_ARROWS:
+        if arrow in equation:
+            lhs, rhs = equation.split(arrow, 1)
+            return lhs, rhs, True
+    for arrow in _FORWARD_ARROWS:
+        if arrow in equation:
+            lhs, rhs = equation.split(arrow, 1)
+            return lhs, rhs, False
+    raise ValueError(f"No reaction arrow (<=>, -->, =>) found in equation: {equation!r}")
+
+
+def _parse_side(side: str) -> list[tuple[float, str, str | None]]:
+    """Parse one side of an equation into ``[(coefficient, token, fallback), ...]``.
+
+    The ``fallback`` slot is for the ambiguous ``"<number> <rest>"`` shape: when
+    matching by name, ``"2 oxoglutarate"`` could be either ``coeff=2, name="oxoglutarate"``
+    or ``coeff=1, name="2 oxoglutarate"`` (a real chemistry name). We return the
+    coefficient-split form as the primary and the full term as the fallback; the
+    resolver picks whichever matches an existing metabolite. Pure-number heads
+    with no name (``"2"``) and pure-name terms (``"glucose"``) have no fallback.
+    """
+    terms: list[tuple[float, str, str | None]] = []
+    for raw in side.split(" + "):
+        term = raw.strip()
+        if not term:
+            continue
+        head, _, tail = term.partition(" ")
+        try:
+            coeff = float(head)
+            token = tail.strip()
+        except ValueError:
+            coeff, token = 1.0, term
+            fallback = None
+        else:
+            # Coefficient-split succeeded. Keep the full term as a fallback when
+            # the tail is non-empty so name-resolution can re-try it as one token.
+            fallback = term if token else None
+        if not token:
+            raise ValueError(f"Missing metabolite after coefficient in term: {raw!r}")
+        terms.append((coeff, token, fallback))
+    return terms
+
+
+def _new_met_id(model: cobra.Model, prefix: str) -> str:
+    """Next free ``<prefix><int>`` metabolite ID (RAVEN m1, m2, ... scheme)."""
+    pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+    used = [int(m.group(1)) for met in model.metabolites if (m := pattern.match(met.id))]
+    n = max(used) + 1 if used else 1
+    while f"{prefix}{n}" in model.metabolites:
+        n += 1
+    return f"{prefix}{n}"
+
+
+def _try_existing(
+    model: cobra.Model, token: str, *, mets_by: str, compartment: str | None
+) -> Metabolite | None:
+    """Look up ``token`` as an existing metabolite (no creation, no side effects).
+
+    Returns the matching metabolite or ``None``. Used by ``_stoichiometry`` to
+    disambiguate the ``"<number> <rest>"`` shape: if a metabolite whose *name*
+    (or id) literally contains a leading number exists, prefer it over splitting
+    the number off as a coefficient.
+    """
+    name, comp = parse_name_comp(token)
+    if mets_by == "id" and comp is None:
+        return model.metabolites.get_by_id(token) if token in model.metabolites else None
+    target_comp = comp if comp is not None else compartment
+    if target_comp is None:
+        return None
+    for met in model.metabolites:
+        if met.name == name and met.compartment == target_comp:
+            return met
+    return None
+
+
+def _resolve_metabolite(
+    model: cobra.Model,
+    token: str,
+    *,
+    mets_by: str,
+    compartment: str | None,
+    allow_new_mets: bool,
+    new_met_prefix: str,
+) -> Metabolite:
+    """Resolve an equation token to an existing or newly created Metabolite."""
+    name, comp = parse_name_comp(token)
+
+    if mets_by == "id" and comp is None:
+        # token is a metabolite ID
+        if token in model.metabolites:
+            return model.metabolites.get_by_id(token)
+        if not allow_new_mets:
+            raise ValueError(
+                f"Unknown metabolite ID {token!r}; pass allow_new_mets=True to create it."
+            )
+        if compartment is None:
+            raise ValueError(
+                f"Cannot create metabolite {token!r}: no compartment given."
+            )
+        _warn_unknown_compartment(model, compartment, token)
+        met = Metabolite(token, compartment=compartment)
+        model.add_metabolites([met])
+        return met
+
+    # name-based (mets_by="name") or explicit name[comp]
+    target_comp = comp if comp is not None else compartment
+    if target_comp is None:
+        raise ValueError(
+            f"Metabolite {token!r} matched by name needs a compartment; "
+            "pass compartment=... or use the name[comp] syntax."
+        )
+    if comp is not None and target_comp not in model.compartments and not allow_new_mets:
+        raise ValueError(f"Compartment {target_comp!r} is not in the model.")
+
+    matches = [
+        met
+        for met in model.metabolites
+        if met.name == name and met.compartment == target_comp
+    ]
+    if matches:
+        return matches[0]
+    if not allow_new_mets:
+        raise ValueError(
+            f"No metabolite named {name!r} in compartment {target_comp!r}; "
+            "pass allow_new_mets=True to create it."
+        )
+    _warn_unknown_compartment(model, target_comp, name)
+    met = Metabolite(_new_met_id(model, new_met_prefix), name=name, compartment=target_comp)
+    model.add_metabolites([met])
+    return met
+
+
+def _warn_unknown_compartment(model: cobra.Model, compartment: str, identifier: str) -> None:
+    """Warn when a new metabolite would be born into a not-yet-registered compartment.
+
+    Both ``mets_by`` paths previously created the metabolite without validating
+    the compartment, so a typo (``"cyto"`` for ``"c"``) silently produced a
+    one-metabolite ghost compartment. cobra inherits the compartment from the
+    first metabolite assigned to it, so the fix is a warning, not a hard error.
+    """
+    known = set(model.compartments) | set(model._compartments)
+    if compartment not in known:
+        warnings.warn(
+            f"Creating metabolite {identifier!r} in unregistered compartment "
+            f"{compartment!r} (existing: {sorted(known) or 'none'}); "
+            "add the compartment first or check for a typo.",
+            stacklevel=5,
+        )
+
+
+def _stoichiometry(
+    model: cobra.Model,
+    equation: str,
+    *,
+    mets_by: str,
+    compartment: str | None,
+    allow_new_mets: bool,
+    new_met_prefix: str,
+) -> tuple[dict[Metabolite, float], bool]:
+    """Parse an equation into a {Metabolite: net coefficient} dict + reversibility."""
+    lhs, rhs, reversible = _split_equation(equation)
+    coeffs: OrderedDict[Metabolite, float] = OrderedDict()
+    had_terms = False
+    for sign, side in ((-1.0, lhs), (1.0, rhs)):
+        for coeff, token, fallback in _parse_side(side):
+            had_terms = True
+            # "<number> <name>" is ambiguous when the name itself starts with a
+            # number (e.g. "2 oxoglutarate"). Prefer the full-term interpretation
+            # when it matches an existing metabolite — otherwise fall through to
+            # the coefficient-split form.
+            met = None
+            if fallback is not None:
+                met = _try_existing(
+                    model, fallback, mets_by=mets_by, compartment=compartment
+                )
+                if met is not None:
+                    coeff = 1.0
+            if met is None:
+                met = _resolve_metabolite(
+                    model,
+                    token,
+                    mets_by=mets_by,
+                    compartment=compartment,
+                    allow_new_mets=allow_new_mets,
+                    new_met_prefix=new_met_prefix,
+                )
+            coeffs[met] = coeffs.get(met, 0.0) + sign * coeff
+    # Drop metabolites that net to zero (present as both substrate and product).
+    coeffs = OrderedDict((met, c) for met, c in coeffs.items() if c != 0.0)
+    if had_terms and not coeffs:
+        warnings.warn(
+            f"Equation {equation!r} has no net metabolites (all terms cancelled); "
+            "the reaction will be added with empty stoichiometry.",
+            stacklevel=4,
+        )
+    return dict(coeffs), reversible
+
+
+def add_reactions_from_equations(
+    model: cobra.Model,
+    reactions: Sequence[Mapping],
+    *,
+    mets_by: str = "id",
+    compartment: str | None = None,
+    allow_new_mets: bool = True,
+    allow_new_genes: bool = True,
+    new_met_prefix: str = "m",
+) -> list[Reaction]:
+    """Add reactions defined by equation strings, matching mets by ID or name.
+    Parameters
+    ----------
+    model
+        Target ``cobra.Model``, mutated in place.
+    reactions
+        Sequence of mappings, one per reaction. Recognised keys:
+
+        * ``id`` (**required**) — reaction ID; must not already exist.
+        * ``equation`` (**required**) — e.g. ``"atp_c + h2o_c <=> adp_c + pi_c"``.
+          Use ``<=>`` for reversible, ``-->``/``->``/``=>`` for irreversible.
+        * ``name`` — reaction name.
+        * ``bounds`` — ``(lower, upper)`` tuple; overrides the arrow.
+        * ``gene_reaction_rule`` — GPR string.
+        * ``subsystem`` — subsystem name.
+    mets_by
+        How bare equation tokens (without ``[comp]``) are matched:
+        ``"id"`` (RAVEN eqnType 1) or ``"name"`` (eqnType 2). A ``name[comp]``
+        token (eqnType 3) is always matched by name + compartment.
+    compartment
+        Default compartment for new metabolites and for name-matched tokens
+        without an explicit ``[comp]``.
+    allow_new_mets
+        If True (default), create metabolites not found. New metabolites get
+        ``compartment`` (id mode) or an auto ID ``m1``, ``m2``, ... (name mode).
+        If False, an unknown metabolite raises.
+    allow_new_genes
+        If True (default), genes in a GPR are auto-created by cobra. If False,
+        a GPR referencing a gene not already in the model raises.
+    new_met_prefix
+        Prefix for auto-generated metabolite IDs in name mode (default ``"m"``).
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions added, in input order.
+    """
+    if mets_by not in ("id", "name"):
+        raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")
+
+    known_genes = {gene.id for gene in model.genes}
+    added: list[Reaction] = []
+
+    for spec in reactions:
+        if "id" not in spec:
+            raise ValueError(f"Reaction spec missing required 'id': {spec!r}")
+        rxn_id = spec["id"]
+        if rxn_id in model.reactions:
+            raise ValueError(
+                f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first."
+            )
+        if "equation" not in spec:
+            raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.")
+
+        coeffs, reversible = _stoichiometry(
+            model,
+            spec["equation"],
+            mets_by=mets_by,
+            compartment=compartment,
+            allow_new_mets=allow_new_mets,
+            new_met_prefix=new_met_prefix,
+        )
+
+        rxn = Reaction(rxn_id, name=spec.get("name", ""))
+        if "bounds" in spec:
+            rxn.bounds = tuple(spec["bounds"])
+        else:
+            config = cobra.Configuration()
+            lower = config.lower_bound if reversible else 0.0
+            rxn.bounds = (lower, config.upper_bound)
+        if "subsystem" in spec:
+            rxn.subsystem = spec["subsystem"]
+
+        model.add_reactions([rxn])
+        rxn.add_metabolites(coeffs)
+
+        rule = spec.get("gene_reaction_rule", "")
+        if rule:
+            if not allow_new_genes:
+                missing = sorted(set(GPR.from_string(rule).genes) - known_genes)
+                if missing:
+                    raise ValueError(
+                        f"Reaction {rxn_id!r} references genes not in the model: "
+                        f"{missing}. Set allow_new_genes=True or add them first."
+                    )
+            rxn.gene_reaction_rule = rule
+            known_genes.update(gene.id for gene in rxn.genes)
+
+        added.append(rxn)
+
+    return added
diff --git a/src/raven_python/manipulation/change.py b/src/raven_python/manipulation/change.py
new file mode 100644
index 0000000..78612ba
--- /dev/null
+++ b/src/raven_python/manipulation/change.py
@@ -0,0 +1,125 @@
+"""Change the stoichiometry of existing reactions from equation strings.
+
+Editing the same ``Reaction`` object changes only its stoichiometry — its id, name,
+bounds, GPR, subsystem, and position are preserved automatically by cobra.
+
+So this port simply re-parses the equation (reusing the same metabolite
+matching as :func:`~raven_python.manipulation.add.add_reactions_from_equations`,
+including name and ``name[comp]`` modes that cobra lacks) and swaps the
+metabolites in place.
+
+Like RAVEN, **bounds are left unchanged** even if the new equation's arrow
+implies a different reversibility — use a bounds setter for that.
+"""
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+import cobra
+from cobra import Reaction
+
+from raven_python.manipulation.add import _stoichiometry
+
+__all__ = ["change_reaction_equations", "change_gene_reaction_rules"]
+
+
+def change_reaction_equations(
+    model: cobra.Model,
+    equations: Mapping[str, str],
+    *,
+    mets_by: str = "id",
+    compartment: str | None = None,
+    allow_new_mets: bool = True,
+    new_met_prefix: str = "m",
+) -> list[Reaction]:
+    """Replace the stoichiometry of existing reactions.
+    Parameters
+    ----------
+    model
+        Target ``cobra.Model``, mutated in place.
+    equations
+        Mapping of ``reaction_id -> equation string``. Every ID must already
+        exist in the model. Equation syntax is identical to
+        :func:`~raven_python.manipulation.add.add_reactions_from_equations`.
+    mets_by, compartment, allow_new_mets, new_met_prefix
+        Metabolite-matching options, as in ``add_reactions_from_equations``.
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions changed, in input order.
+
+    Notes
+    -----
+    Bounds are **not** modified, matching RAVEN. Changing an equation from
+    ``-->`` to ``<=>`` does not by itself make the reaction reversible; adjust
+    the bounds separately.
+    """
+    if mets_by not in ("id", "name"):
+        raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")
+
+    changed: list[Reaction] = []
+    for rxn_id, equation in equations.items():
+        if rxn_id not in model.reactions:
+            raise ValueError(f"Reaction {rxn_id!r} not found in the model.")
+        rxn = model.reactions.get_by_id(rxn_id)
+
+        coeffs, _reversible = _stoichiometry(
+            model,
+            equation,
+            mets_by=mets_by,
+            compartment=compartment,
+            allow_new_mets=allow_new_mets,
+            new_met_prefix=new_met_prefix,
+        )
+
+        rxn.subtract_metabolites(dict(rxn.metabolites), combine=True)
+        rxn.add_metabolites(coeffs)
+        changed.append(rxn)
+
+    return changed
+
+
+def change_gene_reaction_rules(
+    model: cobra.Model,
+    rules: Mapping[str, str],
+    *,
+    replace: bool = True,
+) -> list[Reaction]:
+    """Set or append gene-reaction rules on existing reactions.
+    cobra already does the heavy lifting on assignment to
+    ``reaction.gene_reaction_rule``: it auto-creates any new ``Gene`` objects and
+    normalises the rule. So the value here is batching plus RAVEN's ``replace``
+    option to **append** rather than overwrite.
+
+    Parameters
+    ----------
+    model
+        Target ``cobra.Model``, mutated in place.
+    rules
+        Mapping of ``reaction_id -> GPR string``. Every ID must already exist.
+    replace
+        If True (default), overwrite the existing GPR. If False, append the new
+        rule as an isozyme: ``(old) or (new)`` (just ``new`` if the reaction had
+        no GPR).
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions changed, in input order.
+    """
+    changed: list[Reaction] = []
+    for rxn_id, rule in rules.items():
+        if rxn_id not in model.reactions:
+            raise ValueError(f"Reaction {rxn_id!r} not found in the model.")
+        rxn = model.reactions.get_by_id(rxn_id)
+
+        if replace or not rxn.gene_reaction_rule:
+            new_rule = rule
+        else:
+            new_rule = f"({rxn.gene_reaction_rule}) or ({rule})"
+
+        rxn.gene_reaction_rule = new_rule  # cobra creates genes + normalises
+        changed.append(rxn)
+
+    return changed
diff --git a/src/raven_python/manipulation/compartments.py b/src/raven_python/manipulation/compartments.py
new file mode 100644
index 0000000..091d196
--- /dev/null
+++ b/src/raven_python/manipulation/compartments.py
@@ -0,0 +1,196 @@
+"""Compartment manipulation — merge all compartments into one, or copy reactions to a
+new compartment (ports of RAVEN's ``mergeCompartments`` and ``copyToComps``).
+
+Both functions are useful **independently of** :func:`raven_python.localization.predict_localization`:
+``merge_compartments`` flattens a multi-compartment model for a simplified analysis
+(e.g. checking whether the network can in principle make a metabolite, with no
+compartment topology in the way); ``copy_to_compartment`` is a building block for
+constructing dual-localised pathways. cobra has no equivalents.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+
+# Compartments produced by merge_compartments (RAVEN uses 's' for "system").
+_MERGED_COMPARTMENT = "s"
+
+
+def merge_compartments(
+    model: cobra.Model,
+    *,
+    merged_id: str = _MERGED_COMPARTMENT,
+    merged_name: str = "system",
+    drop_single_metabolite_reactions: bool = True,
+    deduplicate_reactions: bool = True,
+) -> tuple[cobra.Model, list[str], list[str]]:
+    """Merge every metabolite of ``model`` into one ``merged_id`` compartment.
+
+    Returns ``(model_copy, deleted_single_met_reactions, deduplicated_reactions)``. The
+    returned model is a deep copy of the input. Use cases:
+
+    * Check whether the network can produce/consume a metabolite at all (compartment
+      topology is often what makes a model look blocked).
+    * Simplify a model for visualisation or an analysis that doesn't care about
+      compartments.
+    * As a pre-step for localisation when the user does want RAVEN's
+      "start from scratch" workflow (call :func:`merge_compartments` then
+      :func:`raven_python.localization.predict_localization` with the full reaction list).
+
+    Metabolites that already share a base id (e.g. ``glc__D_c`` and ``glc__D_e`` both
+    map to ``glc__D``) collapse into one entity in the merged compartment; their
+    stoichiometric contributions are summed per reaction. Reactions that end up with
+    only one metabolite (e.g. ``A[c] → A[m]`` becomes ``A → A`` = nothing) are deleted
+    by default (RAVEN's ``deleteRxnsWithOneMet``). Reactions that become identical
+    after merging are deduplicated (one survives).
+    """
+    out = model.copy()
+
+    # 1. For each metabolite, derive a base id (strip the trailing _<compartment>).
+    #    Two mets in different compartments sharing the base id collapse to one.
+    new_to_old: dict[str, list[cobra.Metabolite]] = {}
+    for m in list(out.metabolites):
+        base = _base_id(m)
+        new_to_old.setdefault(base, []).append(m)
+
+    # 2. Build the merged metabolites and rewrite reactions.
+    canonical: dict[str, cobra.Metabolite] = {}
+    for base, mets in new_to_old.items():
+        proto = mets[0]
+        new_met = cobra.Metabolite(base, name=proto.name, compartment=merged_id,
+                                    formula=proto.formula, charge=proto.charge)
+        new_met.notes = dict(proto.notes or {})
+        canonical[base] = new_met
+
+    # Rewrite all reactions: replace each metabolite with its canonical, summing
+    # coefficients where multiple original mets collapse to one.
+    rewritten: dict[str, dict[str, float]] = {}
+    for r in list(out.reactions):
+        new_stoich: dict[cobra.Metabolite, float] = {}
+        for m, coeff in list(r.metabolites.items()):
+            canon = canonical[_base_id(m)]
+            new_stoich[canon] = new_stoich.get(canon, 0.0) + coeff
+        # Drop zero net coefficients (substrate + product of the same base met cancel).
+        new_stoich = {m: c for m, c in new_stoich.items() if c != 0.0}
+        rewritten[r.id] = {m.id: c for m, c in new_stoich.items()}
+
+    # Now build a fresh model with the canonical mets + rewritten reactions; the
+    # cobra in-place rewrite would require careful constraint surgery, so a clean
+    # rebuild is simpler and less error-prone.
+    merged = cobra.Model(out.id or "merged")
+    merged.compartments = {merged_id: merged_name}
+    merged.add_metabolites(list(canonical.values()))
+    deleted_single: list[str] = []
+    deduplicated: list[str] = []
+    seen_signatures: dict[tuple, str] = {}
+    keep_reactions: list[cobra.Reaction] = []
+    for r in out.reactions:
+        stoich = rewritten[r.id]
+        if drop_single_metabolite_reactions and len(stoich) <= 1:
+            deleted_single.append(r.id)
+            continue
+        if not stoich:  # everything cancelled
+            deleted_single.append(r.id)
+            continue
+        sig = (frozenset(stoich.items()), bool(r.lower_bound < 0), bool(r.upper_bound > 0))
+        if deduplicate_reactions and sig in seen_signatures:
+            deduplicated.append(r.id)
+            continue
+        seen_signatures[sig] = r.id
+        new_r = cobra.Reaction(r.id, name=r.name, lower_bound=r.lower_bound,
+                                upper_bound=r.upper_bound)
+        new_r.add_metabolites({merged.metabolites.get_by_id(mid): c for mid, c in stoich.items()})
+        new_r.gene_reaction_rule = r.gene_reaction_rule
+        if r.subsystem:
+            new_r.subsystem = r.subsystem
+        new_r.notes = dict(r.notes or {})
+        keep_reactions.append(new_r)
+    merged.add_reactions(keep_reactions)
+    return merged, deleted_single, deduplicated
+
+
+def copy_to_compartment(
+    model: cobra.Model,
+    reactions: Iterable[str],
+    target_compartment: str,
+    *,
+    target_compartment_name: str | None = None,
+    delete_original: bool = False,
+    id_suffix: str | None = None,
+) -> tuple[cobra.Model, list[str], list[str]]:
+    """Copy a set of reactions into ``target_compartment``. RAVEN's ``copyToComps``.
+
+    Returns ``(model_copy, new_reaction_ids, new_metabolite_ids)``. Use cases:
+
+    * Build a dual-localised pathway (e.g. duplicate glycolysis into a peroxisome).
+    * Mirror a curated subsystem into an additional compartment as a draft to refine.
+    * Set up the input for a flux comparison between alternate compartmentalisations.
+
+    Each copied reaction is given the id ``"<orig_id>_<id_suffix>"`` (default
+    ``id_suffix=target_compartment``); each metabolite it touches is mapped to (or
+    created in) ``target_compartment`` with the same suffix convention. ``delete_original=True``
+    moves the reactions instead of copying.
+    """
+    out = model.copy()
+    suffix = id_suffix if id_suffix is not None else target_compartment
+    if target_compartment not in out.compartments:
+        out.compartments = {**out.compartments,
+                             target_compartment: target_compartment_name or target_compartment}
+
+    preexisting_met_ids = {x.id for x in out.metabolites}
+    new_rxn_ids: list[str] = []
+    for rid in list(reactions):
+        if rid not in out.reactions:
+            raise ValueError(f"reaction {rid!r} not in model")
+        src = out.reactions.get_by_id(rid)
+        new_id = f"{rid}_{suffix}"
+        if new_id in out.reactions:
+            continue  # already copied; idempotent
+        new_stoich: dict[cobra.Metabolite, float] = {}
+        for m, coeff in src.metabolites.items():
+            target_met = _met_in_compartment(out, m, target_compartment, suffix=suffix)
+            new_stoich[target_met] = coeff
+        new_r = cobra.Reaction(new_id, name=src.name,
+                                lower_bound=src.lower_bound, upper_bound=src.upper_bound)
+        new_r.add_metabolites(new_stoich)
+        new_r.gene_reaction_rule = src.gene_reaction_rule
+        if src.subsystem:
+            new_r.subsystem = src.subsystem
+        new_r.notes = dict(src.notes or {})
+        out.add_reactions([new_r])
+        new_rxn_ids.append(new_id)
+        if delete_original:
+            out.remove_reactions([src.id], remove_orphans=False)
+
+    new_met_ids = [m.id for m in out.metabolites if m.id not in preexisting_met_ids]
+    return out, new_rxn_ids, new_met_ids
+
+
+# ----------------------------------------------------------------- helpers
+
+def _base_id(m: cobra.Metabolite) -> str:
+    """Strip the trailing ``_<compartment>`` suffix from a metabolite id (if present)."""
+    if m.compartment and m.id.endswith(f"_{m.compartment}"):
+        return m.id[: -(len(m.compartment) + 1)]
+    return m.id
+
+
+def _met_in_compartment(model: cobra.Model, source: cobra.Metabolite,
+                        compartment: str, *, suffix: str | None = None) -> cobra.Metabolite:
+    """Return (creating if needed) the copy of ``source`` in ``compartment``.
+
+    The new metabolite id is ``"<base>_<suffix>"`` (default ``suffix=compartment``).
+    Already-existing copies are reused.
+    """
+    if source.compartment == compartment:
+        return source
+    base = _base_id(source)
+    new_id = f"{base}_{suffix if suffix is not None else compartment}"
+    if new_id in model.metabolites:
+        return model.metabolites.get_by_id(new_id)
+    new_met = cobra.Metabolite(new_id, name=source.name, compartment=compartment,
+                                formula=source.formula, charge=source.charge)
+    new_met.notes = dict(source.notes or {})
+    model.add_metabolites([new_met])
+    return new_met
diff --git a/src/raven_python/manipulation/expand.py b/src/raven_python/manipulation/expand.py
new file mode 100644
index 0000000..246f3b9
--- /dev/null
+++ b/src/raven_python/manipulation/expand.py
@@ -0,0 +1,124 @@
+"""Expand reactions with isozymes into one reaction per isozyme.
+
+Operates on cobra's GPR AST, so the model stays a plain ``cobra.Model`` throughout.
+
+Provenance: this implementation was first written for geckopy
+(``geckopy/ec_model/pipeline/expand.py``, where it backed makeEcModel stage 5)
+and is adopted here as its canonical home; geckopy will import it from raven_python
+once raven_python is published.
+
+MATLAB-COMPAT: GECKO MATLAB and RAVEN ``expandModel.m`` use string manipulation
+on grRules to detect and split isozymes. raven_python uses cobrapy's GPR AST
+instead. Output should be equivalent for any well-formed GPR; cases that differ
+are likely malformed GPR strings that the AST flags as invalid.
+"""
+from __future__ import annotations
+
+import ast
+import copy
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _gpr_to_dnf(gpr: GPR) -> list[list[str]]:
+    """Convert a GPR to disjunctive normal form (list of AND-clauses).
+
+    An empty GPR yields an empty list. A single clause (no OR anywhere)
+    yields a list of length 1. OR-of-ANDs yields one sublist per
+    disjunct, each containing the gene names ANDed together.
+
+    Handles distributivity: ``g1 and (g2 or g3)`` becomes
+    ``[[g1, g2], [g1, g3]]``.
+    """
+    if gpr is None or gpr.body is None:
+        return []
+    return _node_to_dnf(gpr.body)
+
+
+def _node_to_dnf(node) -> list[list[str]]:
+    """Recursive helper. Returns DNF as list of AND-clauses."""
+    if isinstance(node, ast.Name):
+        return [[node.id]]
+    if isinstance(node, ast.BoolOp):
+        if isinstance(node.op, ast.Or):
+            result: list[list[str]] = []
+            for child in node.values:
+                result.extend(_node_to_dnf(child))
+            return result
+        if isinstance(node.op, ast.And):
+            clauses: list[list[str]] = [[]]
+            for child in node.values:
+                child_dnf = _node_to_dnf(child)
+                new_clauses: list[list[str]] = []
+                for existing in clauses:
+                    for extra in child_dnf:
+                        new_clauses.append(existing + extra)
+                clauses = new_clauses
+            return clauses
+    raise ValueError(f"Unexpected GPR node type: {type(node).__name__}")
+
+
+def expand_model(model: cobra.Model) -> list[str]:
+    """Split reactions with isozymes (OR in GPR) into one reaction per isozyme.
+    For each reaction whose GPR contains at least one OR, the reaction
+    is removed and replaced by one copy per disjunctive clause. The new
+    reactions get ID suffix ``_EXP_1``, ``_EXP_2``, etc. All other
+    fields (stoichiometry, bounds, name, subsystem) are copied verbatim;
+    only the GPR is simplified to the single AND-clause for that
+    isozyme.
+
+    Reactions with no GPR, or with a GPR that has no OR, are left
+    untouched.
+
+    Parameters
+    ----------
+    model
+        A cobra.Model, mutated in place.
+
+    Returns
+    -------
+    list of str
+        Sorted IDs of newly added expanded reactions (those with
+        ``_EXP_N`` suffixes). The original reactions that were split
+        are no longer in the model.
+    """
+    expansions: list[tuple[cobra.Reaction, list[list[str]]]] = []
+
+    for rxn in model.reactions:
+        if not rxn.gene_reaction_rule:
+            continue
+        clauses = _gpr_to_dnf(rxn.gpr)
+        if len(clauses) <= 1:
+            continue
+        expansions.append((rxn, clauses))
+
+    added_ids: list[str] = []
+    for original_rxn, clauses in expansions:
+        new_rxns: list[cobra.Reaction] = []
+        for i, clause in enumerate(clauses, start=1):
+            new_rxn = cobra.Reaction(
+                id=f"{original_rxn.id}_EXP_{i}",
+                name=original_rxn.name,
+            )
+            new_rxn.lower_bound = original_rxn.lower_bound
+            new_rxn.upper_bound = original_rxn.upper_bound
+            new_rxn.add_metabolites(dict(original_rxn.metabolites.items()))
+            new_rxn.subsystem = original_rxn.subsystem
+            new_rxn.gene_reaction_rule = " and ".join(clause)
+            # Propagate per-reaction metadata (notably ec-code / annotations)
+            # so downstream functions see the same annotations on expanded
+            # reactions as on the original. Deep-copy so siblings are independent.
+            new_rxn.annotation = copy.deepcopy(original_rxn.annotation)
+            new_rxn.notes = copy.deepcopy(original_rxn.notes)
+            new_rxns.append(new_rxn)
+
+        obj_coeff = original_rxn.objective_coefficient
+        model.remove_reactions([original_rxn])
+        model.add_reactions(new_rxns)
+        if obj_coeff:  # keep the original in the objective — sum over its isozyme copies
+            for new_rxn in new_rxns:
+                new_rxn.objective_coefficient = obj_coeff
+        added_ids.extend(r.id for r in new_rxns)
+
+    return sorted(added_ids)
diff --git a/src/raven_python/manipulation/irreversible.py b/src/raven_python/manipulation/irreversible.py
new file mode 100644
index 0000000..3f64a68
--- /dev/null
+++ b/src/raven_python/manipulation/irreversible.py
@@ -0,0 +1,72 @@
+"""Convert reversible reactions to an irreversible (forward + reverse) form.
+
+cobrapy's own ``convert_to_irreversible`` was removed, so this is a genuine
+implementation rather than a wrapper.
+
+Provenance: first written for geckopy
+(``geckopy/ec_model/pipeline/preprocess.py``, makeEcModel stage 4, tagged
+"RAVENpy candidate") and adopted here as its canonical home; geckopy will
+import it from raven_python once raven_python is published.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def convert_to_irreversible(model: cobra.Model) -> list[str]:
+    """Split non-exchange reversible reactions into a forward + reverse pair.
+    For each non-exchange reaction with ``lb < 0``:
+
+    - The original reaction is kept as the forward direction. Its
+      lower bound is clamped to 0.
+    - A new reaction with the same ID plus a ``_REV`` suffix is added,
+      representing the reverse direction. Its stoichiometry is the
+      negation of the original, its bounds are ``(0, -original_lb)``,
+      and it inherits the name (with " (reversible)" appended) and the
+      gene-protein rule of the original.
+
+    Exchange reactions (boundary reactions) are never split, regardless
+    of their bounds, matching MATLAB behavior where exchange reactions
+    are explicitly excluded from ``convertToIrrev``.
+
+    Parameters
+    ----------
+    model
+        A cobra.Model, mutated in place.
+
+    Returns
+    -------
+    list of str
+        Sorted IDs of newly added reverse reactions (the ones ending in
+        ``_REV``). The forward reactions retain their original IDs.
+    """
+    reverse_rxns_to_add: list[cobra.Reaction] = []
+    forward_updates: list[cobra.Reaction] = []
+
+    for rxn in model.reactions:
+        if rxn.boundary:
+            continue
+        if rxn.lower_bound >= 0:
+            continue
+
+        original_lb = rxn.lower_bound
+
+        rev_rxn = cobra.Reaction(
+            id=f"{rxn.id}_REV",
+            name=(f"{rxn.name} (reversible)" if rxn.name else f"{rxn.id}_REV"),
+        )
+        rev_rxn.lower_bound = 0.0
+        rev_rxn.upper_bound = -original_lb
+        rev_rxn.add_metabolites({m: -c for m, c in rxn.metabolites.items()})
+        rev_rxn.gene_reaction_rule = rxn.gene_reaction_rule
+
+        reverse_rxns_to_add.append(rev_rxn)
+        forward_updates.append(rxn)
+
+    for rxn in forward_updates:
+        rxn.lower_bound = 0.0
+
+    if reverse_rxns_to_add:
+        model.add_reactions(reverse_rxns_to_add)
+
+    return sorted(r.id for r in reverse_rxns_to_add)
diff --git a/src/raven_python/manipulation/merge.py b/src/raven_python/manipulation/merge.py
new file mode 100644
index 0000000..bfa1f24
--- /dev/null
+++ b/src/raven_python/manipulation/merge.py
@@ -0,0 +1,146 @@
+"""Merge several models into one.
+
+cobra's ``Model.merge`` is pairwise and matches everything strictly by id; this
+merges **N** models and unifies metabolites by **name[compartment]** (so the same
+compound under different ids in two models becomes one), while adding **all**
+reactions without de-duplication
+(a reaction whose ID already exists is renamed ``id_<sourceid>``). Genes are
+unified by ID. Provenance (which source model each object came from) is recorded
+in ``notes['origin']``.
+
+The bulk of RAVEN's function is struct field-padding and manual S-matrix
+assembly, none of which is needed on ``cobra.Model``.
+"""
+from __future__ import annotations
+
+import copy
+import warnings
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Model, Reaction
+
+
+def _unique_id(existing, base: str, suffix: str) -> str:
+    """Return base, or base_suffix (then base_suffix_2, ...) if it collides."""
+    if base not in existing:
+        return base
+    candidate = f"{base}_{suffix}"
+    n = 2
+    while candidate in existing:
+        candidate = f"{base}_{suffix}_{n}"
+        n += 1
+    return candidate
+
+
+def merge_models(
+    models: Iterable[cobra.Model],
+    *,
+    match_by: str = "name",
+    track_origin: bool = True,
+) -> cobra.Model:
+    """Merge models into a single new model.
+    Parameters
+    ----------
+    models
+        The models to merge (two or more). A single model is returned as a copy.
+    match_by
+        How metabolites are unified across models: ``"name"`` (default) treats
+        metabolites with the same *name and compartment* as identical (IDs
+        ignored); ``"id"`` matches by metabolite ID.
+    track_origin
+        If True (default), record the source model's ``id`` in each reaction's,
+        metabolite's, and gene's ``notes['origin']``.
+
+    Returns
+    -------
+    cobra.Model
+        A new merged model (``id="MERGED"``). Reactions are **not** de-duplicated
+        — matching RAVEN, every reaction from every model is kept, with ID
+        collisions renamed ``id_<sourceid>``.
+    """
+    models = list(models)
+    if not models:
+        raise ValueError("merge_models requires at least one model.")
+    if match_by not in ("name", "id"):
+        raise ValueError(f"match_by must be 'name' or 'id', got {match_by!r}")
+    if len(models) == 1:
+        return models[0].copy()
+
+    merged = Model("MERGED")
+    comp_names: dict[str, str] = {}
+    met_lookup: dict = {}  # name/comp or id key -> merged Metabolite
+
+    def met_key(met: Metabolite):
+        return (met.name, met.compartment) if match_by == "name" else met.id
+
+    def ensure_metabolite(src: Metabolite, origin: str) -> Metabolite:
+        key = met_key(src)
+        if key in met_lookup:
+            existing = met_lookup[key]
+            # Two source models can map to the same name[comp] (or id) with
+            # different formula/charge; silently picking the first-seen has
+            # quietly corrupted mass balance in the past. Warn so the caller
+            # sees the conflict.
+            if src.formula and existing.formula and src.formula != existing.formula:
+                warnings.warn(
+                    f"merge_models: metabolite {existing.id!r} (from earlier model) "
+                    f"and {src.id!r} (from {origin!r}) share key {key!r} but "
+                    f"have different formulas ({existing.formula!r} vs {src.formula!r}); "
+                    "keeping the first.",
+                    stacklevel=3,
+                )
+            if (
+                existing.charge is not None
+                and src.charge is not None
+                and existing.charge != src.charge
+            ):
+                warnings.warn(
+                    f"merge_models: metabolite {existing.id!r} (from earlier model) "
+                    f"and {src.id!r} (from {origin!r}) share key {key!r} but "
+                    f"have different charges ({existing.charge} vs {src.charge}); "
+                    "keeping the first.",
+                    stacklevel=3,
+                )
+            return existing
+        new_id = _unique_id(merged.metabolites, src.id, origin)
+        new_met = Metabolite(
+            new_id, name=src.name, compartment=src.compartment,
+            formula=src.formula, charge=src.charge,
+        )
+        new_met.annotation = copy.deepcopy(src.annotation)
+        new_met.notes = copy.deepcopy(src.notes)
+        if track_origin:
+            new_met.notes.setdefault("origin", origin)
+        merged.add_metabolites([new_met])
+        met_lookup[key] = new_met
+        return new_met
+
+    for model in models:
+        origin = model.id or "model"
+        comp_names.update(model.compartments)
+        genes_before = {g.id for g in merged.genes}
+
+        for rxn in model.reactions:
+            new_id = _unique_id(merged.reactions, rxn.id, origin)
+            new_rxn = Reaction(new_id, name=rxn.name)
+            new_rxn.bounds = rxn.bounds
+            new_rxn.subsystem = rxn.subsystem
+            merged.add_reactions([new_rxn])
+            new_rxn.add_metabolites(
+                {ensure_metabolite(m, origin): coef for m, coef in rxn.metabolites.items()}
+            )
+            if rxn.gene_reaction_rule:
+                new_rxn.gene_reaction_rule = rxn.gene_reaction_rule
+            new_rxn.annotation = copy.deepcopy(rxn.annotation)
+            new_rxn.notes = copy.deepcopy(rxn.notes)
+            if track_origin:
+                new_rxn.notes.setdefault("origin", origin)
+
+        if track_origin:
+            for gene in merged.genes:
+                if gene.id not in genes_before:
+                    gene.notes.setdefault("origin", origin)
+
+    merged._compartments.update(comp_names)
+    return merged
diff --git a/src/raven_python/manipulation/parameters.py b/src/raven_python/manipulation/parameters.py
new file mode 100644
index 0000000..f349804
--- /dev/null
+++ b/src/raven_python/manipulation/parameters.py
@@ -0,0 +1,78 @@
+"""Set reaction bounds to a sign-aware ±% variance band around measured values.
+
+Cobra has no idiom for the *variance band* case (e.g. "5 ± 20 %"); the other common
+bound-setting cases are cobra one-liners:
+
+* fixed lb / ub  → ``reaction.lower_bound`` / ``upper_bound`` / ``reaction.bounds``
+* equality       → ``reaction.bounds = (v, v)``
+* objective      → ``model.objective = {reaction: coeff}``
+* unconstrained  → ``reaction.bounds = cobra.Configuration().bounds``
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Sequence
+
+import cobra
+from cobra import Reaction
+
+Number = int | float
+
+
+def _resolve(model: cobra.Model, reactions) -> list[Reaction]:
+    if isinstance(reactions, (str, Reaction)):
+        reactions = [reactions]
+    out: list[Reaction] = []
+    for r in reactions:
+        if isinstance(r, Reaction):
+            out.append(r)
+        elif r in model.reactions:
+            out.append(model.reactions.get_by_id(r))
+        else:
+            raise ValueError(f"Reaction {r!r} not found in the model.")
+    return out
+
+
+def _broadcast(value, n: int) -> list[float]:
+    if isinstance(value, (int, float)):
+        return [float(value)] * n
+    vals = [float(v) for v in value]
+    if len(vals) != n:
+        raise ValueError(
+            f"Expected 1 or {n} values to match the reactions, got {len(vals)}."
+        )
+    return vals
+
+
+def set_variance_bounds(
+    model: cobra.Model,
+    reactions: str | Reaction | Iterable,
+    values: Number | Sequence[Number],
+    percent: Number,
+) -> list[Reaction]:
+    """Constrain reactions to a ``±percent/2`` band around measured values.
+
+    For a measured value ``v`` and ``percent`` ``p``, the bounds become
+    ``v * (1 - p/200) .. v * (1 + p/200)`` — i.e. ``percent`` is the *total*
+    width, split half above and half below. For a negative ``v`` the two are
+    swapped so that ``lb <= ub``. E.g. ``percent=5`` gives 97.5 %..102.5 % of ``v``.
+
+    Parameters
+    ----------
+    reactions
+        Reaction IDs or objects.
+    values
+        Measured value per reaction; a scalar is broadcast to all reactions.
+    percent
+        Total band width as a percentage.
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions affected.
+    """
+    rxns = _resolve(model, reactions)
+    half = percent / 200.0
+    for rxn, v in zip(rxns, _broadcast(values, len(rxns)), strict=True):
+        lo, hi = v * (1 - half), v * (1 + half)
+        rxn.bounds = (hi, lo) if v < 0 else (lo, hi)
+    return rxns
diff --git a/src/raven_python/manipulation/remove.py b/src/raven_python/manipulation/remove.py
new file mode 100644
index 0000000..492de36
--- /dev/null
+++ b/src/raven_python/manipulation/remove.py
@@ -0,0 +1,120 @@
+"""Remove metabolites or genes from a model.
+
+For removing *reactions*, use cobra directly:
+``cobra.Model.remove_reactions(reactions, remove_orphans=...)``.
+
+The two functions here delegate the core to cobra and add the cobra-absent behaviour:
+
+* ``remove_metabolites`` — cobra matches metabolites by ID; RAVEN's ``isNames``
+  deletes a metabolite in **every compartment at once** by name. That name
+  resolution is the *sole* reason this wrapper exists (see the note on it).
+* ``remove_genes`` — cobra's ``cobra.manipulation.remove_genes`` already rewrites
+  GPRs through the boolean AST (removing one gene of ``A and B`` empties the
+  rule, of ``A or B`` keeps the other) — exactly RAVEN's intent, without its
+  ``eval``. The gap is RAVEN's default of **constraining** flux-blocked reactions
+  to zero instead of deleting them; exposed as ``blocked_reactions``.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+from cobra import Gene, Metabolite
+from cobra.manipulation import remove_genes as _cobra_remove_genes
+
+
+def _as_list(obj) -> list:
+    if isinstance(obj, (str, Metabolite, Gene)):
+        return [obj]
+    return list(obj)
+
+
+def remove_metabolites(
+    model: cobra.Model,
+    metabolites: str | Metabolite | Iterable,
+    *,
+    by_name: bool = False,
+    destructive: bool = False,
+) -> None:
+    """Remove metabolites, optionally matching by name across all compartments.
+
+    Parameters
+    ----------
+    by_name
+        If True, ``metabolites`` are metabolite *names*; every metabolite with a
+        matching name is removed, regardless of compartment (RAVEN ``isNames``).
+        If False, they are IDs/objects, resolved via cobra.
+    destructive
+        Passed to cobra: if True, also remove every reaction the metabolite
+        participates in.
+
+    Note
+    ----
+    With ``by_name=False`` this is just ``model.remove_metabolites`` — the
+    ``by_name`` cross-compartment deletion is the only thing this adds over cobra.
+    """
+    if by_name:
+        wanted = set(_as_list(metabolites))
+        targets = [m for m in model.metabolites if m.name in wanted]
+    else:
+        targets = model.metabolites.get_by_any(_as_list(metabolites))
+    if targets:
+        model.remove_metabolites(targets, destructive=destructive)
+
+
+def remove_genes(
+    model: cobra.Model,
+    genes: str | Gene | Iterable,
+    *,
+    blocked_reactions: str = "remove",
+    remove_orphans: bool = False,
+) -> list[str]:
+    """Remove genes and handle reactions left unable to carry flux.
+
+    GPR rewriting (with correct AND/OR semantics) and gene deletion are done by cobra;
+    this adds a policy for reactions whose GPR becomes empty (no enzyme left):
+
+    * ``"remove"`` — delete them (cobra's default).
+    * ``"constrain"`` — keep them but set bounds to ``(0, 0)``.
+    * ``"keep"`` — leave them with an empty GPR and unchanged bounds.
+
+    ``remove_orphans`` (only meaningful with ``blocked_reactions="remove"``)
+    passes through to cobra: drop metabolites *and* genes orphaned by the removal.
+
+    Returns
+    -------
+    list of str
+        IDs of the reactions that became flux-blocked (had a GPR, now empty).
+    """
+    if blocked_reactions not in ("remove", "constrain", "keep"):
+        raise ValueError(
+            f"blocked_reactions must be 'remove', 'constrain', or 'keep', "
+            f"got {blocked_reactions!r}"
+        )
+
+    # Resolve to gene IDs that are actually in the model (RAVEN filters likewise).
+    requested = [g.id if isinstance(g, Gene) else g for g in _as_list(genes)]
+    present = [gid for gid in requested if gid in model.genes]
+    if not present:
+        return []
+
+    # Reactions touched by these genes that currently have a GPR.
+    affected = set()
+    for gid in present:
+        affected.update(r.id for r in model.genes.get_by_id(gid).reactions)
+    had_gpr = {rid for rid in affected if model.reactions.get_by_id(rid).gene_reaction_rule}
+
+    # cobra rewrites GPRs (AST) and removes the gene objects; we manage reactions.
+    _cobra_remove_genes(model, present, remove_reactions=False)
+
+    blocked = [
+        rid for rid in had_gpr if not model.reactions.get_by_id(rid).gene_reaction_rule
+    ]
+
+    if blocked_reactions == "remove":
+        model.remove_reactions(blocked, remove_orphans=remove_orphans)
+    elif blocked_reactions == "constrain":
+        for rid in blocked:
+            model.reactions.get_by_id(rid).bounds = (0, 0)
+
+    return sorted(blocked)
diff --git a/src/raven_python/manipulation/simplify.py b/src/raven_python/manipulation/simplify.py
new file mode 100644
index 0000000..2deaccd
--- /dev/null
+++ b/src/raven_python/manipulation/simplify.py
@@ -0,0 +1,229 @@
+"""Reduce a model by removing/merging reactions that cannot carry flux.
+
+Four reduction modes that cobra does not cover out of the box:
+``remove_dead_end_reactions`` (reactions whose substrates have no producer),
+``remove_duplicate_reactions``, ``constrain_reversible_reactions`` (tighten bounds
+via FVA), and ``group_linear_reactions`` (lossy fold of unit-stoichiometry chains
+into one reaction; drops gene rules).
+
+Cobra-covered modes that you'd reach for separately:
+
+* No-flux removal → ``cobra.flux_analysis.find_blocked_reactions``.
+* Zero-interval removal → filter reactions with ``bounds == (0, 0)`` then prune.
+"""
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable
+
+import cobra
+from cobra.flux_analysis import flux_variability_analysis
+
+from raven_python.manipulation.irreversible import convert_to_irreversible
+
+
+def _prune_orphan_metabolites(model: cobra.Model) -> list[str]:
+    orphans = [m for m in model.metabolites if not m.reactions]
+    if orphans:
+        model.remove_metabolites(orphans)
+    return [m.id for m in orphans]
+
+
+def _can_produce_and_consume(met) -> tuple[bool, bool]:
+    """Whether the network can both produce and consume ``met`` (given directions)."""
+    produce = consume = False
+    for rxn in met.reactions:
+        coef = rxn.get_coefficient(met)
+        if coef > 0:
+            produce |= rxn.upper_bound > 0
+            consume |= rxn.lower_bound < 0
+        elif coef < 0:
+            consume |= rxn.upper_bound > 0
+            produce |= rxn.lower_bound < 0
+    return produce, consume
+
+
+def remove_dead_end_reactions(
+    model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> tuple[list[str], list[str]]:
+    """Iteratively remove dead-end reactions and metabolites.
+
+    A metabolite
+    is a dead end if it participates in only one reaction, or if (accounting for
+    reaction directionality) it can only be produced or only consumed — such
+    metabolites cannot carry steady-state flux, so the reactions touching them
+    are removed. Repeats until stable.
+
+    Returns ``(removed_reaction_ids, removed_metabolite_ids)``.
+    """
+    reserved = set(reserved or [])
+    removed_rxns: list[str] = []
+    removed_mets: list[str] = []
+    while True:
+        removed_mets += _prune_orphan_metabolites(model)
+        dead = [
+            m
+            for m in model.metabolites
+            if len(m.reactions) <= 1 or not all(_can_produce_and_consume(m))
+        ]
+        if not dead:
+            break
+        rxns = {r for m in dead for r in m.reactions}
+        to_delete = [r for r in rxns if r.id not in reserved]
+        if not to_delete:
+            break
+        removed_rxns += [r.id for r in to_delete]
+        model.remove_reactions(to_delete)
+    return removed_rxns, removed_mets
+
+
+def _signature(rxn):
+    mets = frozenset((m.id, c) for m, c in rxn.metabolites.items())
+    return (mets, rxn.lower_bound, rxn.upper_bound, rxn.objective_coefficient)
+
+
+def remove_duplicate_reactions(
+    model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> list[str]:
+    """Remove all-but-one of each set of duplicate reactions.
+
+    Reactions are duplicates when they have identical stoichiometry, bounds, and
+    objective coefficient. One of each set is kept (reserved reactions are never
+    removed). Returns the removed reaction IDs.
+    """
+    reserved = set(reserved or [])
+    groups: dict = {}
+    for rxn in model.reactions:
+        groups.setdefault(_signature(rxn), []).append(rxn)
+
+    removed: list[str] = []
+    for rxns in groups.values():
+        if len(rxns) <= 1:
+            continue
+        keep = rxns[-1]
+        to_remove = [r for r in rxns if r is not keep and r.id not in reserved]
+        if to_remove:
+            removed += [r.id for r in to_remove]
+            model.remove_reactions(to_remove)
+    return removed
+
+
+def constrain_reversible_reactions(
+    model: cobra.Model, *, eps: float = 1e-9
+) -> list[str]:
+    """Constrain reversible reactions that can only carry flux one way.
+
+    Runs FVA on
+    each reversible reaction; if it can only carry forward flux its lower bound
+    is set to 0, and if it can only carry reverse flux it is flipped to a forward
+    reaction (stoichiometry, bounds, and objective negated). Returns the changed
+    reaction IDs.
+    """
+    revs = [r for r in model.reactions if r.lower_bound < 0 < r.upper_bound]
+    if not revs:
+        return []
+    # Infeasible models surface as either OptimizationError (Gurobi/HiGHS) or
+    # NaN-filled ranges (some optlang backends silently). Catch both and raise
+    # a single clear error — the original ``abs(NaN) < eps`` comparison would
+    # have silently no-op'd, letting bogus "all reactions truly reversible"
+    # decisions sneak through.
+    try:
+        fva = flux_variability_analysis(
+            model, reaction_list=revs, fraction_of_optimum=0.0
+        )
+    except Exception as exc:  # noqa: BLE001 - solver-family agnostic
+        raise RuntimeError(
+            "constrain_reversible_reactions: FVA failed — the model is likely "
+            "infeasible at fraction_of_optimum=0. Fix the infeasibility first "
+            "(often a missing exchange or an over-constrained essential). "
+            f"({exc})"
+        ) from exc
+    if fva[["minimum", "maximum"]].isna().any().any():
+        raise RuntimeError(
+            "constrain_reversible_reactions: FVA returned NaN ranges — the "
+            "model is infeasible at fraction_of_optimum=0. Fix the infeasibility "
+            "first (often a missing exchange or an over-constrained essential)."
+        )
+
+    changed: list[str] = []
+    for rxn in revs:
+        lo = fva.at[rxn.id, "minimum"]
+        hi = fva.at[rxn.id, "maximum"]
+        # Guard against ±inf ranges (unbounded objective): treat them as truly
+        # reversible rather than "zero" by the abs(·) < eps check.
+        if math.isinf(lo) or math.isinf(hi):
+            continue
+        min_zero, max_zero = abs(lo) < eps, abs(hi) < eps
+        if min_zero == max_zero:  # both ~0 (blocked) or both nonzero (truly reversible)
+            continue
+        if max_zero:  # only reverse flux → flip to a forward reaction
+            old_lb = rxn.lower_bound
+            rxn.add_metabolites({m: -2 * c for m, c in rxn.metabolites.items()})
+            rxn.bounds = (0.0, -old_lb)
+            rxn.objective_coefficient = -rxn.objective_coefficient
+        else:  # only forward flux
+            rxn.lower_bound = 0.0
+        changed.append(rxn.id)
+    return changed
+
+
+def group_linear_reactions(
+    model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> None:
+    """Merge linear (single-producer, single-consumer) reaction chains.
+
+    **Lossy**: gene-reaction
+    associations are discarded (RAVEN does the same), since merged reactions have
+    no meaningful combined GPR. The model is first made irreversible, then any
+    metabolite that is produced by exactly one reaction and consumed by exactly
+    one reaction is eliminated by merging the two reactions. Mutates in place.
+    """
+    reserved = set(reserved or [])
+
+    # Lossy: drop all gene information.
+    for rxn in model.reactions:
+        rxn.gene_reaction_rule = ""
+    for gene in list(model.genes):
+        model.genes.remove(gene)
+
+    convert_to_irreversible(model)
+
+    # Worklist of metabolites to (re)consider for merging. Each metabolite
+    # participating in a merge can expose new linear chains in its neighbours,
+    # so we re-enqueue the touched mets rather than restart the whole scan
+    # (the old O(n²·m) restart-after-every-merge loop).
+    pending: list = list(model.metabolites)
+    seen_in_pass: set = set()
+    while pending:
+        met = pending.pop()
+        if met not in model.metabolites:  # removed in a previous merge
+            continue
+        rxns = list(met.reactions)
+        if len(rxns) != 2 or any(r.id in reserved for r in rxns):
+            continue
+        r1, r2 = rxns
+        c1, c2 = r1.get_coefficient(met), r2.get_coefficient(met)
+        if (c1 > 0) == (c2 > 0):  # need one producer and one consumer
+            continue
+        ratio = abs(c1 / c2)
+        new_lb = max(r1.lower_bound, r2.lower_bound / ratio)
+        new_ub = min(r1.upper_bound, r2.upper_bound / ratio)
+        new_obj = r1.objective_coefficient + r2.objective_coefficient * ratio
+        # Re-enqueue every metabolite touched by either side — the merge can
+        # turn neighbours into single-producer/consumer chains in turn.
+        touched = {m for m in r1.metabolites} | {m for m in r2.metabolites}
+        # Merge r2*ratio into r1; the shared metabolite cancels and is dropped.
+        r1.add_metabolites({m: c * ratio for m, c in r2.metabolites.items()})
+        model.remove_reactions([r2])
+        r1.bounds = (new_lb, new_ub)
+        r1.objective_coefficient = new_obj
+        seen_in_pass.clear()
+        for m in touched:
+            if m in model.metabolites and id(m) not in seen_in_pass:
+                seen_in_pass.add(id(m))
+                pending.append(m)
+    # One terminal cleanup pass (cheap; only what remains).
+    empty = [r for r in model.reactions if not r.metabolites]
+    if empty:
+        model.remove_reactions(empty)
+    _prune_orphan_metabolites(model)
diff --git a/src/raven_python/manipulation/transfer.py b/src/raven_python/manipulation/transfer.py
new file mode 100644
index 0000000..b867f02
--- /dev/null
+++ b/src/raven_python/manipulation/transfer.py
@@ -0,0 +1,144 @@
+"""Copy reactions (with their metabolites and genes) from another model.
+
+cobra's ``Model.merge`` / ``add_reactions`` match metabolites strictly by id. This
+transfers a chosen set of reactions from a *source* model into a draft, matching
+metabolites by **name[compartment]** instead — so a compound present in both models
+under different ids is reused rather than duplicated, and only genuinely new
+metabolites are created (copying the source's id, formula,
+charge, and annotation). New genes are auto-created by cobra when the GPR is set.
+This is the post-``getModelFromHomology`` "copy a few more reactions across"
+workflow.
+"""
+from __future__ import annotations
+
+import copy
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Reaction
+
+from raven_python.manipulation.add import _new_met_id
+
+
+def _name_comp(met: Metabolite) -> str:
+    return f"{met.name}[{met.compartment}]"
+
+
+def add_reactions_from_model(
+    model: cobra.Model,
+    source_model: cobra.Model,
+    reactions: str | Iterable[str],
+    *,
+    genes: bool | str | Iterable[str] = False,
+    note: str | None = "Added via add_reactions_from_model()",
+    confidence: int | None = None,
+) -> list[Reaction]:
+    """Copy reactions from ``source_model`` into ``model``.
+    Parameters
+    ----------
+    model
+        Draft model to copy into (mutated in place).
+    source_model
+        Model to copy reactions from.
+    reactions
+        Reaction ID(s) in ``source_model``. Reactions already present in
+        ``model`` (by ID) are skipped.
+    genes
+        ``False`` (default): add reactions without GPRs. ``True``: copy each
+        reaction's GPR from the source. A string: use it as the GPR for every
+        added reaction. A list: per-reaction GPRs (matching the reactions that
+        are actually added). New genes are created automatically.
+    note
+        Stored in each added reaction's ``notes['note']`` (set ``None`` to skip).
+    confidence
+        If given, stored in each added reaction's ``notes['confidence_score']``.
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions added, in input order.
+    """
+    rxn_ids = [reactions] if isinstance(reactions, str) else list(reactions)
+    missing = [r for r in rxn_ids if r not in source_model.reactions]
+    if missing:
+        raise ValueError(f"Reactions not found in the source model: {missing}")
+
+    new_ids = [r for r in rxn_ids if r not in model.reactions]
+    if not new_ids:
+        raise ValueError("All reactions are already in the model.")
+    source_rxns = [source_model.reactions.get_by_id(r) for r in new_ids]
+
+    if genes is False:
+        rules = [""] * len(source_rxns)
+    elif genes is True:
+        rules = [r.gene_reaction_rule for r in source_rxns]
+    elif isinstance(genes, str):
+        rules = [genes] * len(source_rxns)
+    else:
+        rules = list(genes)
+        if len(rules) != len(source_rxns):
+            raise ValueError(
+                f"genes list has {len(rules)} rules but {len(source_rxns)} "
+                "reactions are being added."
+            )
+
+    # Match metabolites by name[comp]; create only the genuinely new ones.
+    draft_by_name = {_name_comp(m): m for m in model.metabolites}
+    new_mets: list[Metabolite] = []
+    pending: set[str] = set()
+    # Track ids minted within this batch so two source mets that share an id
+    # but differ in name[comp] don't collide when add_metabolites runs.
+    pending_ids: set[str] = set()
+    for srx in source_rxns:
+        for met in srx.metabolites:
+            key = _name_comp(met)
+            if key in draft_by_name or key in pending:
+                continue
+            pending.add(key)
+            if met.id not in model.metabolites and met.id not in pending_ids:
+                new_id = met.id
+            else:
+                # _new_met_id only knows the model; loop past in-batch hits too.
+                new_id = _new_met_id(model, "m")
+                while new_id in pending_ids:
+                    n = int(new_id[1:]) + 1
+                    new_id = f"m{n}"
+                    while new_id in model.metabolites:
+                        n += 1
+                        new_id = f"m{n}"
+            pending_ids.add(new_id)
+            new_met = Metabolite(
+                new_id,
+                name=met.name,
+                compartment=met.compartment,
+                formula=met.formula,
+                charge=met.charge,
+            )
+            new_met.annotation = copy.deepcopy(met.annotation)
+            new_met.notes = copy.deepcopy(met.notes)
+            new_mets.append(new_met)
+            draft_by_name[key] = new_met
+    if new_mets:
+        model.add_metabolites(new_mets)
+
+    added: list[Reaction] = []
+    for srx, rule in zip(source_rxns, rules, strict=True):
+        rxn = Reaction(srx.id, name=srx.name)
+        rxn.bounds = srx.bounds
+        rxn.subsystem = srx.subsystem
+        model.add_reactions([rxn])
+        rxn.add_metabolites(
+            {draft_by_name[_name_comp(met)]: coef for met, coef in srx.metabolites.items()}
+        )
+        if rule:
+            rxn.gene_reaction_rule = rule
+        rxn.annotation = copy.deepcopy(srx.annotation)
+        notes = copy.deepcopy(srx.notes)
+        if note is not None:
+            notes["note"] = note
+        if confidence is not None:
+            notes["confidence_score"] = confidence
+        rxn.notes = notes
+        added.append(rxn)
+
+    return added
diff --git a/src/raven_python/manipulation/transport.py b/src/raven_python/manipulation/transport.py
new file mode 100644
index 0000000..d0c1bf1
--- /dev/null
+++ b/src/raven_python/manipulation/transport.py
@@ -0,0 +1,157 @@
+"""Add transport reactions between compartments.
+
+cobra has no transport-reaction primitive. For each metabolite this matches the
+species by *name* across compartments (the source in ``from_compartment`` and its
+same-named twin in each target compartment), optionally creating the target
+metabolite, and
+builds a ``-1 from / +1 to`` reaction with a sequential ``tr_0001`` ID.
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Reaction
+
+from raven_python.manipulation.add import _new_met_id
+
+
+def _index_by_name(mets: Iterable[Metabolite], compartment: str) -> dict[str, Metabolite]:
+    """Index metabolites by name, warning when a name is duplicated.
+
+    Same-name duplicates in a single compartment are unusual but legal in cobra,
+    and the previous one-pass dict comprehension silently dropped all but one.
+    """
+    out: dict[str, list[Metabolite]] = {}
+    for m in mets:
+        out.setdefault(m.name, []).append(m)
+    chosen: dict[str, Metabolite] = {}
+    for name, group in out.items():
+        if len(group) > 1:
+            warnings.warn(
+                f"Multiple metabolites named {name!r} in compartment {compartment!r} "
+                f"({[m.id for m in group]}); using {group[0].id!r} for transport.",
+                stacklevel=3,
+            )
+        chosen[name] = group[0]
+    return chosen
+
+
+def _transport_id_factory(model: cobra.Model, prefix: str):
+    pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+    used = [int(m.group(1)) for r in model.reactions if (m := pattern.match(r.id))]
+    counter = max(used) + 1 if used else 1
+
+    def next_id() -> str:
+        nonlocal counter
+        while f"{prefix}{counter:04d}" in model.reactions:
+            counter += 1
+        rid = f"{prefix}{counter:04d}"
+        counter += 1
+        return rid
+
+    return next_id
+
+
+def add_transport_reactions(
+    model: cobra.Model,
+    from_compartment: str,
+    to_compartments: str | Iterable[str],
+    metabolite_names: str | Iterable[str] | None = None,
+    *,
+    reversible: bool = True,
+    only_to_existing: bool = True,
+    id_prefix: str = "tr_",
+) -> list[Reaction]:
+    """Add transport reactions from one compartment to one or more others.
+    Parameters
+    ----------
+    from_compartment
+        Source compartment id.
+    to_compartments
+        Target compartment id(s).
+    metabolite_names
+        Names of metabolites to transport. Default: every metabolite in
+        ``from_compartment``.
+    reversible
+        If True (default), bounds span the cobra configuration default
+        (reversible); otherwise lower bound 0.
+    only_to_existing
+        If True (default), only transport a metabolite into a target
+        compartment where a same-named metabolite already exists. If False,
+        create the missing target metabolite (copying name/formula/charge/
+        annotation from the source) before adding the transport.
+    id_prefix
+        Prefix for the sequential reaction IDs (``tr_0001``, ...).
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The transport reactions added, in creation order.
+    """
+    # cobra's `model.compartments` only lists compartments that have metabolites;
+    # include registered-but-empty ones so transport can target an empty compartment.
+    known = set(model.compartments) | set(model._compartments)
+    if from_compartment not in known:
+        raise ValueError(f"Compartment {from_compartment!r} is not in the model.")
+    if isinstance(to_compartments, str):
+        to_compartments = [to_compartments]
+    else:
+        to_compartments = list(to_compartments)
+    for comp in to_compartments:
+        if comp not in known:
+            raise ValueError(f"Compartment {comp!r} is not in the model.")
+
+    source = _index_by_name(
+        (m for m in model.metabolites if m.compartment == from_compartment),
+        from_compartment,
+    )
+    if metabolite_names is None:
+        names = list(source)
+    else:
+        names = [metabolite_names] if isinstance(metabolite_names, str) else list(metabolite_names)
+        missing = [n for n in names if n not in source]
+        if missing:
+            raise ValueError(
+                f"Metabolites not found in compartment {from_compartment!r}: {missing}"
+            )
+
+    cfg = cobra.Configuration()
+    bounds = (cfg.lower_bound, cfg.upper_bound) if reversible else (0.0, cfg.upper_bound)
+    from_name = model.compartments.get(from_compartment) or from_compartment
+    next_id = _transport_id_factory(model, id_prefix)
+
+    added: list[Reaction] = []
+    for to_comp in to_compartments:
+        to_name = model.compartments.get(to_comp) or to_comp
+        targets = _index_by_name(
+            (m for m in model.metabolites if m.compartment == to_comp),
+            to_comp,
+        )
+        for name in names:
+            src = source[name]
+            dst = targets.get(name)
+            if dst is None:
+                if only_to_existing:
+                    continue
+                dst = Metabolite(
+                    _new_met_id(model, "m"),
+                    name=name,
+                    compartment=to_comp,
+                    formula=src.formula,
+                    charge=src.charge,
+                )
+                dst.annotation = dict(src.annotation)
+                model.add_metabolites([dst])
+                targets[name] = dst
+
+            rxn = Reaction(next_id())
+            rxn.name = f"{name} transport, {from_name}-{to_name}"
+            rxn.bounds = bounds
+            model.add_reactions([rxn])
+            rxn.add_metabolites({src: -1, dst: 1})
+            added.append(rxn)
+
+    return added
diff --git a/tests/test_change_grrules.py b/tests/test_change_grrules.py
new file mode 100644
index 0000000..d33f723
--- /dev/null
+++ b/tests/test_change_grrules.py
@@ -0,0 +1,49 @@
+"""Tests for change_gene_reaction_rules (changeGrRules port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, change_gene_reaction_rules
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"},
+            {"id": "R2", "equation": "a_c --> b_c"},
+        ],
+    )
+    return m
+
+
+def test_replace_rule_and_create_genes(model):
+    (rxn,) = change_gene_reaction_rules(model, {"R1": "G2 and G3"})
+    assert rxn.gene_reaction_rule == "G2 and G3"
+    assert {g.id for g in rxn.genes} == {"G2", "G3"}
+    assert {"G2", "G3"} <= {g.id for g in model.genes}
+
+
+def test_append_rule(model):
+    change_gene_reaction_rules(model, {"R1": "G4"}, replace=False)
+    # (G1) or (G4), normalised by cobra
+    assert model.reactions.get_by_id("R1").gene_reaction_rule == "G1 or G4"
+
+
+def test_append_when_empty_is_just_new(model):
+    change_gene_reaction_rules(model, {"R2": "G5"}, replace=False)
+    assert model.reactions.get_by_id("R2").gene_reaction_rule == "G5"
+
+
+def test_batch(model):
+    changed = change_gene_reaction_rules(model, {"R1": "GA", "R2": "GB"})
+    assert [r.id for r in changed] == ["R1", "R2"]
+
+
+def test_unknown_reaction_errors(model):
+    with pytest.raises(ValueError, match="not found"):
+        change_gene_reaction_rules(model, {"NOPE": "G1"})
diff --git a/tests/test_manipulation_add.py b/tests/test_manipulation_add.py
new file mode 100644
index 0000000..2a3a9d3
--- /dev/null
+++ b/tests/test_manipulation_add.py
@@ -0,0 +1,278 @@
+"""Tests for raven_python.manipulation.add (addRxns port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils.parse import parse_name_comp
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+            cobra.Metabolite("h2o_c", name="H2O", compartment="c"),
+            cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+            cobra.Metabolite("pi_c", name="phosphate", compartment="c"),
+        ]
+    )
+    return m
+
+
+# --- parse_name_comp -------------------------------------------------------
+
+@pytest.mark.parametrize(
+    "token,expected",
+    [
+        ("ATP[c]", ("ATP", "c")),
+        ("ATP", ("ATP", None)),
+        ("  ATP[c] ", ("ATP", "c")),
+        ("weird[name][m]", ("weird[name]", "m")),
+    ],
+)
+def test_parse_name_comp(token, expected):
+    assert parse_name_comp(token) == expected
+
+
+# --- id mode (eqnType 1) ---------------------------------------------------
+
+def test_add_by_id_basic_and_reversibility(model):
+    (rxn,) = add_reactions_from_equations(
+        model, [{"id": "R1", "equation": "atp_c + h2o_c <=> adp_c + pi_c"}]
+    )
+    assert rxn.id == "R1"
+    assert rxn.reversibility is True
+    assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+        "atp_c": -1.0,
+        "h2o_c": -1.0,
+        "adp_c": 1.0,
+        "pi_c": 1.0,
+    }
+
+
+def test_irreversible_arrows(model):
+    rxns = add_reactions_from_equations(
+        model,
+        [
+            {"id": "R1", "equation": "atp_c --> adp_c"},
+            {"id": "R2", "equation": "atp_c => adp_c"},
+        ],
+    )
+    for r in rxns:
+        assert r.lower_bound == 0.0
+        assert r.reversibility is False
+
+
+def test_coefficients(model):
+    (rxn,) = add_reactions_from_equations(
+        model, [{"id": "R1", "equation": "2 atp_c + 1.5 h2o_c --> adp_c"}]
+    )
+    assert rxn.get_coefficient("atp_c") == -2.0
+    assert rxn.get_coefficient("h2o_c") == -1.5
+
+
+def test_id_mode_creates_new_met_in_compartment(model):
+    add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "atp_c --> amp_c"}],
+        compartment="c",
+    )
+    assert "amp_c" in model.metabolites
+    assert model.metabolites.get_by_id("amp_c").compartment == "c"
+
+
+def test_id_mode_new_met_without_compartment_errors(model):
+    with pytest.raises(ValueError, match="no compartment"):
+        add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> amp_c"}])
+
+
+# --- name mode (eqnType 2) -------------------------------------------------
+
+def test_name_mode_matches_existing_by_name(model):
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "ATP + H2O <=> ADP + phosphate"}],
+        mets_by="name",
+        compartment="c",
+    )
+    # resolved to the existing _c metabolites, not new ones
+    assert {m.id for m in rxn.metabolites} == {"atp_c", "h2o_c", "adp_c", "pi_c"}
+    assert len(model.metabolites) == 4
+
+
+def test_name_mode_creates_new_met_with_auto_id(model):
+    add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "ATP --> AMP"}],
+        mets_by="name",
+        compartment="c",
+    )
+    new = [m for m in model.metabolites if m.name == "AMP"]
+    assert len(new) == 1
+    assert new[0].id == "m1"
+    assert new[0].compartment == "c"
+
+
+def test_name_mode_requires_compartment(model):
+    with pytest.raises(ValueError, match="needs a compartment"):
+        add_reactions_from_equations(
+            model, [{"id": "R1", "equation": "ATP --> ADP"}], mets_by="name"
+        )
+
+
+# --- name[comp] mode (eqnType 3) -------------------------------------------
+
+def test_name_comp_syntax(model):
+    model.add_metabolites([cobra.Metabolite("atp_m", name="ATP", compartment="m")])
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "ATP[c] --> ATP[m]"}],
+        mets_by="name",
+        compartment="c",
+    )
+    # matched ATP in two different compartments by name[comp]
+    assert {m.id for m in rxn.metabolites} == {"atp_c", "atp_m"}
+
+
+# --- genes -----------------------------------------------------------------
+
+def test_gene_rule_auto_creates_genes(model):
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}],
+    )
+    assert {g.id for g in rxn.genes} == {"G1", "G2"}
+    assert {g.id for g in model.genes} == {"G1", "G2"}
+
+
+def test_strict_genes_errors_on_unknown(model):
+    with pytest.raises(ValueError, match="genes not in the model"):
+        add_reactions_from_equations(
+            model,
+            [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}],
+            allow_new_genes=False,
+        )
+
+
+def test_strict_genes_ok_when_present(model):
+    model.genes.append(cobra.core.gene.Gene("G1"))
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}],
+        allow_new_genes=False,
+    )
+    assert rxn.gene_reaction_rule == "G1"
+
+
+# --- guards & extras -------------------------------------------------------
+
+def test_duplicate_reaction_id_errors(model):
+    model.add_reactions([cobra.Reaction("R1")])
+    with pytest.raises(ValueError, match="already exists"):
+        add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> adp_c"}])
+
+
+def test_strict_mets_errors(model):
+    with pytest.raises(ValueError, match="allow_new_mets"):
+        add_reactions_from_equations(
+            model,
+            [{"id": "R1", "equation": "atp_c --> amp_c"}],
+            compartment="c",
+            allow_new_mets=False,
+        )
+
+
+def test_explicit_bounds_override_arrow(model):
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "atp_c <=> adp_c", "bounds": (0, 50), "name": "myrxn"}],
+    )
+    assert rxn.bounds == (0, 50)
+    assert rxn.name == "myrxn"
+
+
+def test_net_zero_metabolite_dropped(model):
+    # atp_c on both sides nets to zero and is removed.
+    (rxn,) = add_reactions_from_equations(
+        model, [{"id": "R1", "equation": "atp_c + h2o_c --> atp_c + adp_c"}]
+    )
+    assert "atp_c" not in {m.id for m in rxn.metabolites}
+    assert {m.id for m in rxn.metabolites} == {"h2o_c", "adp_c"}
+
+
+def test_missing_equation_errors(model):
+    with pytest.raises(ValueError, match="missing required 'equation'"):
+        add_reactions_from_equations(model, [{"id": "R1"}])
+
+
+def test_no_arrow_errors(model):
+    with pytest.raises(ValueError, match="No reaction arrow"):
+        add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c + h2o_c"}])
+
+
+# --- regression: leading-number metabolite name (known_issues.md A1) -------
+
+def test_name_mode_preserves_leading_number_name(model):
+    """A metabolite name that begins with a number isn't misparsed as a coefficient.
+
+    Before the fix the token ``"2 oxoglutarate"`` was parsed as ``(coeff=2, name="oxoglutarate")``
+    silently — corrupting the stoichiometry. The resolver now prefers the full
+    token when it matches an existing metabolite name.
+    """
+    model.add_metabolites([
+        cobra.Metabolite("akg_c", name="2 oxoglutarate", compartment="c"),
+    ])
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "ATP + 2 oxoglutarate --> ADP"}],
+        mets_by="name",
+        compartment="c",
+    )
+    assert rxn.get_coefficient("akg_c") == -1.0  # not -2.0
+    assert rxn.get_coefficient("atp_c") == -1.0
+
+
+def test_name_mode_coefficient_still_works_without_collision(model):
+    """If the full token doesn't match anything, fall back to coefficient split."""
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "2 ATP + H2O --> ADP + phosphate"}],
+        mets_by="name",
+        compartment="c",
+    )
+    assert rxn.get_coefficient("atp_c") == -2.0
+
+
+# --- regression: empty-stoichiometry warning (known_issues.md A2) ----------
+
+def test_empty_stoichiometry_warns(model):
+    """All-terms-cancel reaction warns instead of silently shipping an empty rxn."""
+    with pytest.warns(UserWarning, match="no net metabolites"):
+        (rxn,) = add_reactions_from_equations(
+            model, [{"id": "R1", "equation": "atp_c --> atp_c"}]
+        )
+    assert len(rxn.metabolites) == 0
+
+
+# --- regression: unknown-compartment warning (known_issues.md B2) ----------
+
+def test_id_mode_unknown_compartment_warns(model):
+    """A typo'd compartment used to silently produce a one-met ghost compartment
+    in id mode (the name/[comp] path used to validate, id mode never did)."""
+    with pytest.warns(UserWarning, match="unregistered compartment 'cyto'"):
+        add_reactions_from_equations(
+            model,
+            [{"id": "R1", "equation": "atp_c --> amp_c"}],
+            compartment="cyto",  # typo for 'c'
+        )
+
+
+def test_name_comp_unknown_compartment_warns(model):
+    """Same defensive check in the name[comp] path when allow_new_mets=True."""
+    with pytest.warns(UserWarning, match="unregistered compartment 'mito'"):
+        add_reactions_from_equations(
+            model,
+            [{"id": "R1", "equation": "ATP[c] --> AMP[mito]"}],
+            mets_by="name",
+        )
diff --git a/tests/test_manipulation_change.py b/tests/test_manipulation_change.py
new file mode 100644
index 0000000..8d54f58
--- /dev/null
+++ b/tests/test_manipulation_change.py
@@ -0,0 +1,93 @@
+"""Tests for raven_python.manipulation.change (changeRxns port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, change_reaction_equations
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("a_c", name="A", compartment="c"),
+            cobra.Metabolite("b_c", name="B", compartment="c"),
+            cobra.Metabolite("c_c", name="C", compartment="c"),
+        ]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {
+                "id": "R1",
+                "equation": "a_c <=> b_c",
+                "name": "first",
+                "bounds": (-30, 70),
+                "gene_reaction_rule": "G1 or G2",
+                "subsystem": "sub",
+            },
+            {"id": "R2", "equation": "a_c --> c_c"},
+        ],
+    )
+    return m
+
+
+def test_changes_stoichiometry(model):
+    (rxn,) = change_reaction_equations(model, {"R1": "a_c --> 2 c_c"})
+    assert rxn.id == "R1"
+    assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+        "a_c": -1.0,
+        "c_c": 2.0,
+    }
+
+
+def test_preserves_other_fields(model):
+    before = model.reactions.get_by_id("R1")
+    name, bounds, subsystem = before.name, before.bounds, before.subsystem
+    genes = {g.id for g in before.genes}
+
+    change_reaction_equations(model, {"R1": "a_c --> c_c"})
+
+    after = model.reactions.get_by_id("R1")
+    assert after.name == name
+    assert after.bounds == bounds  # bounds untouched, per RAVEN
+    assert after.subsystem == subsystem
+    assert {g.id for g in after.genes} == genes
+
+
+def test_preserves_reaction_order(model):
+    order_before = [r.id for r in model.reactions]
+    change_reaction_equations(model, {"R1": "b_c --> c_c"})
+    assert [r.id for r in model.reactions] == order_before
+
+
+def test_bounds_not_changed_by_arrow(model):
+    # R1 starts reversible (-30, 70); a --> arrow must NOT make it irreversible.
+    change_reaction_equations(model, {"R1": "a_c --> b_c"})
+    assert model.reactions.get_by_id("R1").bounds == (-30, 70)
+
+
+def test_name_mode(model):
+    (rxn,) = change_reaction_equations(
+        model, {"R2": "A --> C"}, mets_by="name", compartment="c"
+    )
+    assert {m.id for m in rxn.metabolites} == {"a_c", "c_c"}
+
+
+def test_can_introduce_new_met(model):
+    change_reaction_equations(
+        model, {"R2": "a_c --> d_c"}, compartment="c"
+    )
+    assert "d_c" in model.metabolites
+    assert model.reactions.get_by_id("R2").get_coefficient("d_c") == 1.0
+
+
+def test_unknown_reaction_errors(model):
+    with pytest.raises(ValueError, match="not found"):
+        change_reaction_equations(model, {"NOPE": "a_c --> b_c"})
+
+
+def test_multiple_reactions(model):
+    changed = change_reaction_equations(model, {"R1": "a_c --> c_c", "R2": "b_c --> c_c"})
+    assert [r.id for r in changed] == ["R1", "R2"]
+    assert model.reactions.get_by_id("R2").get_coefficient("b_c") == -1.0
diff --git a/tests/test_manipulation_compartments.py b/tests/test_manipulation_compartments.py
new file mode 100644
index 0000000..4d3fb3b
--- /dev/null
+++ b/tests/test_manipulation_compartments.py
@@ -0,0 +1,139 @@
+"""Tests for manipulation/compartments.py — merge_compartments + copy_to_compartment."""
+from __future__ import annotations
+
+import cobra
+import pytest
+
+from raven_python.manipulation.compartments import copy_to_compartment, merge_compartments
+
+
+def _two_compartment_model() -> cobra.Model:
+    """A_c → B_c, A_m → B_m, and a transport A_c ↔ A_m. Multi-compartment toy."""
+    m = cobra.Model("toy")
+    A_c = cobra.Metabolite("A_c", name="A", compartment="c")
+    A_m = cobra.Metabolite("A_m", name="A", compartment="m")
+    B_c = cobra.Metabolite("B_c", name="B", compartment="c")
+    B_m = cobra.Metabolite("B_m", name="B", compartment="m")
+    m.add_metabolites([A_c, A_m, B_c, B_m])
+
+    def rxn(rid, lb, ub, mets, gpr=None):
+        r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub)
+        r.add_metabolites(mets)
+        if gpr:
+            r.gene_reaction_rule = gpr
+        return r
+    m.add_reactions([rxn("r_c", 0, 1000, {A_c: -1, B_c: 1}, "g1"),
+                     rxn("r_m", 0, 1000, {A_m: -1, B_m: 1}, "g2"),
+                     rxn("tr_A", -1000, 1000, {A_c: -1, A_m: 1})])
+    return m
+
+
+# ----------------------------------------------------------------- merge_compartments
+
+def test_merge_compartments_collapses_to_one():
+    """A_c + A_m → A; B_c + B_m → B; transport A_c↔A_m self-cancels and is dropped."""
+    m = _two_compartment_model()
+    merged, deleted, dupes = merge_compartments(m)
+    # Only the base ids survive.
+    assert {x.id for x in merged.metabolites} == {"A", "B"}
+    # The transport reaction collapsed (A → A) and was deleted.
+    assert "tr_A" in deleted
+    # r_c and r_m are now both A → B; one of them gets deduplicated.
+    surviving = {r.id for r in merged.reactions}
+    assert len(surviving & {"r_c", "r_m"}) == 1
+    assert (set(dupes) | (surviving & {"r_c", "r_m"})) == {"r_c", "r_m"}
+
+
+def test_merge_compartments_preserves_gpr_and_subsystem():
+    m = _two_compartment_model()
+    m.reactions.r_c.subsystem = "carbo"
+    merged, _, _ = merge_compartments(m)
+    survivor = next(r for r in merged.reactions if r.id in {"r_c", "r_m"})
+    # The survivor keeps its gene rule + subsystem (cobra may sometimes lose them
+    # through copy; we set them explicitly).
+    assert survivor.gene_reaction_rule in {"g1", "g2"}
+    if survivor.id == "r_c":
+        assert survivor.subsystem == "carbo"
+
+
+def test_merge_compartments_keeps_single_met_reactions_when_asked():
+    """drop_single_metabolite_reactions=False keeps the collapsed transport (now A → A,
+    which is empty stoichiometry after net-cancellation — still dropped, but the *one-met*
+    case is the more interesting one). Use a uniport pattern to exercise it."""
+    m = cobra.Model("uniport")
+    A_c = cobra.Metabolite("A_c", name="A", compartment="c")
+    A_m = cobra.Metabolite("A_m", name="A", compartment="m")
+    H_c = cobra.Metabolite("H_c", name="H", compartment="c")
+    m.add_metabolites([A_c, A_m, H_c])
+    # H+ symport: A_c + H_c → A_m. After merge: A + H → A → leaves H.
+    sym = cobra.Reaction("sym", lower_bound=0, upper_bound=1000)
+    sym.add_metabolites({A_c: -1, H_c: -1, A_m: 1})
+    m.add_reactions([sym])
+    merged_drop, deleted_drop, _ = merge_compartments(m, drop_single_metabolite_reactions=True)
+    assert "sym" in deleted_drop
+    merged_keep, deleted_keep, _ = merge_compartments(m, drop_single_metabolite_reactions=False)
+    # With keep, sym survives as a one-met reaction (consumes H).
+    assert "sym" not in deleted_keep
+    assert "sym" in {r.id for r in merged_keep.reactions}
+
+
+def test_merge_compartments_deduplicate_off_keeps_both():
+    m = _two_compartment_model()
+    merged, _, dupes = merge_compartments(m, deduplicate_reactions=False)
+    assert dupes == []
+    assert {"r_c", "r_m"} <= {r.id for r in merged.reactions}
+
+
+# ----------------------------------------------------------------- copy_to_compartment
+
+def test_copy_to_compartment_basic():
+    """Copy r_c into 'p' (peroxisome): a new reaction r_c_p with metabolites in p."""
+    m = _two_compartment_model()
+    out, new_rxns, new_mets = copy_to_compartment(m, ["r_c"], "p",
+                                                    target_compartment_name="peroxisome")
+    assert "r_c_p" in [r.id for r in out.reactions]
+    new_r = out.reactions.r_c_p
+    assert {x.compartment for x in new_r.metabolites} == {"p"}
+    assert "A_p" in [x.id for x in out.metabolites]
+    assert "B_p" in [x.id for x in out.metabolites]
+    assert new_rxns == ["r_c_p"]
+    assert set(new_mets) == {"A_p", "B_p"}
+    # Original still there.
+    assert "r_c" in [r.id for r in out.reactions]
+
+
+def test_copy_to_compartment_preserves_gpr_and_bounds():
+    m = _two_compartment_model()
+    out, _, _ = copy_to_compartment(m, ["r_c"], "p")
+    new_r = out.reactions.r_c_p
+    assert new_r.gene_reaction_rule == "g1"
+    assert new_r.lower_bound == 0 and new_r.upper_bound == 1000
+
+
+def test_copy_to_compartment_delete_original_is_a_move():
+    m = _two_compartment_model()
+    out, _, _ = copy_to_compartment(m, ["r_c"], "p", delete_original=True)
+    assert "r_c" not in [r.id for r in out.reactions]
+    assert "r_c_p" in [r.id for r in out.reactions]
+
+
+def test_copy_to_compartment_idempotent():
+    """Calling twice doesn't add the reaction twice."""
+    m = _two_compartment_model()
+    out, _, _ = copy_to_compartment(m, ["r_c"], "p")
+    out2, new_rxns, _ = copy_to_compartment(out, ["r_c"], "p")
+    assert new_rxns == []  # nothing added on second call
+    assert len([r for r in out2.reactions if r.id == "r_c_p"]) == 1
+
+
+def test_copy_to_compartment_unknown_reaction_raises():
+    m = _two_compartment_model()
+    with pytest.raises(ValueError, match="not in model"):
+        copy_to_compartment(m, ["does_not_exist"], "p")
+
+
+def test_copy_to_compartment_custom_suffix():
+    m = _two_compartment_model()
+    out, new_rxns, _ = copy_to_compartment(m, ["r_c"], "p", id_suffix="copy1")
+    assert new_rxns == ["r_c_copy1"]
+    assert "A_copy1" in [x.id for x in out.metabolites]
diff --git a/tests/test_manipulation_expand.py b/tests/test_manipulation_expand.py
new file mode 100644
index 0000000..08cd2f2
--- /dev/null
+++ b/tests/test_manipulation_expand.py
@@ -0,0 +1,288 @@
+"""Tests for expand_model (RAVEN expandModel.m) — splitting isozymes into reactions.
+
+Adopted from geckopy's tests/test_expand.py.
+"""
+import cobra
+
+from raven_python.manipulation import expand_model
+from raven_python.manipulation.expand import _gpr_to_dnf
+
+# --------------------------------------------------------------------------- #
+# DNF conversion (internal helper, worth testing directly)
+# --------------------------------------------------------------------------- #
+
+def _dnf_from_gpr_string(gpr_str: str) -> list[list[str]]:
+    from cobra.core.gene import GPR
+
+    gpr = GPR.from_string(gpr_str)
+    return _gpr_to_dnf(gpr)
+
+
+def test_dnf_empty_gpr():
+    assert _dnf_from_gpr_string("") == []
+
+
+def test_dnf_single_gene():
+    assert _dnf_from_gpr_string("g1") == [["g1"]]
+
+
+def test_dnf_simple_and():
+    assert _dnf_from_gpr_string("g1 and g2") == [["g1", "g2"]]
+
+
+def test_dnf_simple_or():
+    assert _dnf_from_gpr_string("g1 or g2") == [["g1"], ["g2"]]
+
+
+def test_dnf_or_of_ands():
+    assert _dnf_from_gpr_string("(g1 and g2) or (g3 and g4)") == [
+        ["g1", "g2"],
+        ["g3", "g4"],
+    ]
+
+
+def test_dnf_distributes_and_over_or():
+    result = _dnf_from_gpr_string("g1 and (g2 or g3)")
+    assert result == [["g1", "g2"], ["g1", "g3"]]
+
+
+def test_dnf_triple_or():
+    assert _dnf_from_gpr_string("g1 or g2 or g3") == [
+        ["g1"], ["g2"], ["g3"],
+    ]
+
+
+def test_dnf_preserves_gene_order_within_clause():
+    result = _dnf_from_gpr_string("g3 and g1 and g2")
+    assert result == [["g3", "g1", "g2"]]
+
+
+# --------------------------------------------------------------------------- #
+# expand_model
+# --------------------------------------------------------------------------- #
+
+def _build_model(
+    reactions: list[tuple[str, dict[str, float], float, float, str]],
+) -> cobra.Model:
+    """Build from (rxn_id, {met_id: coef}, lb, ub, gpr) tuples."""
+    model = cobra.Model("test")
+    mets: dict[str, cobra.Metabolite] = {}
+    for _, stoich, _, _, _ in reactions:
+        for met_id in stoich:
+            if met_id not in mets:
+                mets[met_id] = cobra.Metabolite(met_id, compartment="c")
+
+    for rxn_id, stoich, lb, ub, gpr in reactions:
+        rxn = cobra.Reaction(rxn_id)
+        rxn.lower_bound = lb
+        rxn.upper_bound = ub
+        rxn.add_metabolites({mets[m]: c for m, c in stoich.items()})
+        if gpr:
+            rxn.gene_reaction_rule = gpr
+        model.add_reactions([rxn])
+    return model
+
+
+def test_does_not_expand_reaction_without_gpr():
+    model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "")])
+    added = expand_model(model)
+    assert added == []
+    assert "r1" in {r.id for r in model.reactions}
+
+
+def test_does_not_expand_single_and_clause():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 and g2"),
+    ])
+    added = expand_model(model)
+    assert added == []
+    r1 = model.reactions.get_by_id("r1")
+    assert r1.gene_reaction_rule == "g1 and g2"
+
+
+def test_does_not_expand_single_gene():
+    model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1")])
+    added = expand_model(model)
+    assert added == []
+    assert model.reactions.get_by_id("r1").gene_reaction_rule == "g1"
+
+
+def test_splits_simple_or_into_two_reactions():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+    ])
+    added = expand_model(model)
+
+    assert added == ["r1_EXP_1", "r1_EXP_2"]
+    rxn_ids = {r.id for r in model.reactions}
+    assert "r1" not in rxn_ids
+    assert "r1_EXP_1" in rxn_ids
+    assert "r1_EXP_2" in rxn_ids
+
+    assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1"
+    assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g2"
+
+
+def test_splits_or_of_ands():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+         "(g1 and g2) or (g3 and g4)"),
+    ])
+    added = expand_model(model)
+
+    assert added == ["r1_EXP_1", "r1_EXP_2"]
+    assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2"
+    assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g3 and g4"
+
+
+def test_distributes_and_over_or():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+         "g1 and (g2 or g3)"),
+    ])
+    added = expand_model(model)
+
+    assert added == ["r1_EXP_1", "r1_EXP_2"]
+    assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2"
+    assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g1 and g3"
+
+
+def test_expanded_reactions_inherit_stoichiometry_and_bounds():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 2.0}, -500.0, 1500.0, "g1 or g2"),
+    ])
+    expand_model(model)
+
+    for suffix in ("_EXP_1", "_EXP_2"):
+        rxn = model.reactions.get_by_id(f"r1{suffix}")
+        assert rxn.bounds == (-500.0, 1500.0)
+        stoich = {m.id: c for m, c in rxn.metabolites.items()}
+        assert stoich == {"A": -1.0, "B": 2.0}
+
+
+def test_expanded_reactions_inherit_name_and_subsystem():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+    ])
+    r1 = model.reactions.get_by_id("r1")
+    r1.name = "an isozyme-catalyzed reaction"
+    r1.subsystem = "central metabolism"
+
+    expand_model(model)
+
+    for suffix in ("_EXP_1", "_EXP_2"):
+        rxn = model.reactions.get_by_id(f"r1{suffix}")
+        assert rxn.name == "an isozyme-catalyzed reaction"
+        assert rxn.subsystem == "central metabolism"
+
+
+def test_multiple_reactions_expand_independently():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+        ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"),
+        ("r3", {"C": -1.0, "D": 1.0}, 0.0, 1000.0,
+         "(g5 and g6) or g7 or (g8 and g9)"),
+    ])
+    added = expand_model(model)
+
+    assert added == sorted([
+        "r1_EXP_1", "r1_EXP_2",
+        "r3_EXP_1", "r3_EXP_2", "r3_EXP_3",
+    ])
+
+    rxn_ids = {r.id for r in model.reactions}
+    assert "r2" in rxn_ids
+    assert "r1" not in rxn_ids
+    assert "r3" not in rxn_ids
+
+    assert model.reactions.get_by_id("r2").gene_reaction_rule == "g3 and g4"
+    assert model.reactions.get_by_id("r3_EXP_1").gene_reaction_rule == "g5 and g6"
+    assert model.reactions.get_by_id("r3_EXP_2").gene_reaction_rule == "g7"
+    assert model.reactions.get_by_id("r3_EXP_3").gene_reaction_rule == "g8 and g9"
+
+
+def test_expanded_reaction_has_correct_gene_set():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+         "(g1 and g2) or (g3 and g4)"),
+    ])
+    expand_model(model)
+
+    r1_1 = model.reactions.get_by_id("r1_EXP_1")
+    assert {g.id for g in r1_1.genes} == {"g1", "g2"}
+
+    r1_2 = model.reactions.get_by_id("r1_EXP_2")
+    assert {g.id for g in r1_2.genes} == {"g3", "g4"}
+
+
+def test_expansion_is_idempotent_in_the_no_op_sense():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+        ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"),
+    ])
+    expand_model(model)
+    ids_before = {r.id for r in model.reactions}
+
+    second = expand_model(model)
+    assert second == []
+
+    ids_after = {r.id for r in model.reactions}
+    assert ids_after == ids_before
+
+
+def test_empty_model_is_unchanged():
+    model = cobra.Model("empty")
+    assert expand_model(model) == []
+
+
+# --------------------------------------------------------------------------- #
+# Annotation and notes propagation
+# --------------------------------------------------------------------------- #
+
+def test_expanded_reactions_inherit_annotation_and_notes():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+    ])
+    r1 = model.reactions.get_by_id("r1")
+    r1.annotation["ec-code"] = "1.2.3.4"
+    r1.annotation["sbo"] = "SBO:0000176"
+    r1.notes["custom"] = "hello"
+
+    expand_model(model)
+
+    for suffix in ("_EXP_1", "_EXP_2"):
+        rxn = model.reactions.get_by_id(f"r1{suffix}")
+        assert rxn.annotation["ec-code"] == "1.2.3.4"
+        assert rxn.annotation["sbo"] == "SBO:0000176"
+        assert rxn.notes["custom"] == "hello"
+
+
+def test_expanded_reaction_annotation_is_independent_of_parent():
+    """Mutating one expanded reaction's annotation must not affect siblings."""
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+    ])
+    model.reactions.get_by_id("r1").annotation["ec-code"] = ["1.2.3.4"]
+
+    expand_model(model)
+
+    r1_1 = model.reactions.get_by_id("r1_EXP_1")
+    r1_2 = model.reactions.get_by_id("r1_EXP_2")
+    r1_1.annotation["ec-code"].append("9.9.9.9")
+    assert r1_2.annotation["ec-code"] == ["1.2.3.4"]
+
+
+def test_objective_coefficient_preserved_on_expansion():
+    """An expanded reaction's isozyme copies retain the original objective coefficient."""
+    m = cobra.Model("o")
+    a, b = (cobra.Metabolite(x, compartment="c") for x in "ab")
+    m.add_metabolites([a, b])
+    r = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+    r.add_metabolites({a: -1, b: 1})
+    r.gene_reaction_rule = "g1 or g2"
+    m.add_reactions([r])
+    m.objective = "r1"  # objective on the soon-to-be-expanded reaction
+
+    expand_model(m)
+    coeffs = {rx.id: rx.objective_coefficient for rx in m.reactions}
+    assert coeffs == {"r1_EXP_1": 1.0, "r1_EXP_2": 1.0}  # objective survives on both copies
diff --git a/tests/test_manipulation_irreversible.py b/tests/test_manipulation_irreversible.py
new file mode 100644
index 0000000..e211fa3
--- /dev/null
+++ b/tests/test_manipulation_irreversible.py
@@ -0,0 +1,144 @@
+"""Tests for convert_to_irreversible (RAVEN convertToIrrev.m).
+
+Adopted from geckopy's tests/test_preprocess.py (the convert_to_irreversible subset).
+Exchange reactions are excluded from the split, matching MATLAB behavior.
+"""
+import cobra
+
+from raven_python.manipulation import convert_to_irreversible
+
+
+def _build_model_with_bounds(
+    reactions: list[tuple[str, dict[str, float], float, float]],
+) -> cobra.Model:
+    """Build from (rxn_id, {met_id: coef}, lb, ub) tuples."""
+    model = cobra.Model("test")
+    mets: dict[str, cobra.Metabolite] = {}
+    for _, stoich, _, _ in reactions:
+        for met_id in stoich:
+            if met_id not in mets:
+                mets[met_id] = cobra.Metabolite(met_id, compartment="c")
+
+    for rxn_id, stoich, lb, ub in reactions:
+        rxn = cobra.Reaction(rxn_id)
+        rxn.lower_bound = lb
+        rxn.upper_bound = ub
+        rxn.add_metabolites({mets[m]: c for m, c in stoich.items()})
+        model.add_reactions([rxn])
+    return model
+
+
+def test_splits_single_reversible_non_exchange():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+    ])
+
+    added = convert_to_irreversible(model)
+    assert added == ["r1_REV"]
+
+    fwd = model.reactions.get_by_id("r1")
+    rev = model.reactions.get_by_id("r1_REV")
+
+    assert fwd.bounds == (0.0, 1000.0)
+    assert {m.id: c for m, c in fwd.metabolites.items()} == {"A": -1.0, "B": 1.0}
+
+    assert rev.bounds == (0.0, 500.0)
+    assert {m.id: c for m, c in rev.metabolites.items()} == {"A": 1.0, "B": -1.0}
+
+
+def test_does_not_split_forward_only_reaction():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0),
+    ])
+    added = convert_to_irreversible(model)
+    assert added == []
+    assert "r1_REV" not in {r.id for r in model.reactions}
+
+
+def test_does_not_split_exchange_reaction_even_if_reversible():
+    """Exchange reactions (one metabolite) are explicitly excluded from
+    the irreversibility step in MATLAB, regardless of bounds."""
+    model = _build_model_with_bounds([
+        ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+    ])
+    added = convert_to_irreversible(model)
+    assert added == []
+    ex = model.reactions.get_by_id("EX_A")
+    assert ex.bounds == (-1000.0, 1000.0)
+
+
+def test_splits_multiple_mixed_reactions():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),   # split
+        ("r2", {"B": -2.0, "C": 3.0}, 0.0, 1000.0),      # forward only
+        ("EX_A", {"A": -1.0}, -1000.0, 1000.0),          # exchange
+        ("r3", {"C": -1.0, "D": 1.0}, -200.0, 200.0),    # split
+    ])
+
+    added = convert_to_irreversible(model)
+    assert added == ["r1_REV", "r3_REV"]
+
+    assert model.reactions.get_by_id("r1").bounds == (0.0, 1000.0)
+    assert model.reactions.get_by_id("r1_REV").bounds == (0.0, 500.0)
+    assert model.reactions.get_by_id("r2").bounds == (0.0, 1000.0)
+    assert model.reactions.get_by_id("EX_A").bounds == (-1000.0, 1000.0)
+    assert model.reactions.get_by_id("r3").bounds == (0.0, 200.0)
+    assert model.reactions.get_by_id("r3_REV").bounds == (0.0, 200.0)
+
+
+def test_reverse_reaction_inherits_gpr():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+    ])
+    model.reactions.get_by_id("r1").gene_reaction_rule = "g1 and g2"
+
+    convert_to_irreversible(model)
+
+    rev = model.reactions.get_by_id("r1_REV")
+    assert rev.gene_reaction_rule == "g1 and g2"
+    assert {g.id for g in rev.genes} == {"g1", "g2"}
+
+
+def test_forward_reaction_lb_is_clamped_to_zero():
+    """After splitting, the original reaction should have lb = 0,
+    which is what MATLAB's convertToIrrev does."""
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+    ])
+    convert_to_irreversible(model)
+    assert model.reactions.get_by_id("r1").lower_bound == 0.0
+
+
+def test_no_reverse_reaction_has_negative_bound():
+    """After conversion, no non-exchange reaction may carry negative flux."""
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+        ("r2", {"B": -1.0, "C": 1.0}, -1000.0, 0.0),      # blocked reverse
+        ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+    ])
+    convert_to_irreversible(model)
+    for rxn in model.reactions:
+        if rxn.boundary:
+            continue
+        assert rxn.lower_bound >= 0, f"{rxn.id} still has lb < 0"
+
+
+def test_returns_empty_list_when_nothing_to_split():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0),
+        ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+    ])
+    assert convert_to_irreversible(model) == []
+
+
+def test_conversion_is_idempotent_after_first_pass():
+    """Running convert_to_irreversible twice should not create
+    `_REV_REV` reactions, because the first pass already clamped
+    all non-exchange lb to 0."""
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+    ])
+    convert_to_irreversible(model)
+    second = convert_to_irreversible(model)
+    assert second == []
+    assert "r1_REV_REV" not in {r.id for r in model.reactions}
diff --git a/tests/test_manipulation_merge.py b/tests/test_manipulation_merge.py
new file mode 100644
index 0000000..a430f6e
--- /dev/null
+++ b/tests/test_manipulation_merge.py
@@ -0,0 +1,136 @@
+"""Tests for merge_models (mergeModels port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, merge_models
+
+
+def _model(mid, mets, reactions):
+    m = cobra.Model(mid)
+    m.add_metabolites(mets)
+    add_reactions_from_equations(m, reactions)
+    return m
+
+
+@pytest.fixture
+def model_a():
+    return _model(
+        "A",
+        [
+            cobra.Metabolite("glc_c", name="Glucose", compartment="c"),
+            cobra.Metabolite("g6p_c", name="G6P", compartment="c"),
+        ],
+        [{"id": "HEX", "equation": "glc_c --> g6p_c", "gene_reaction_rule": "GA"}],
+    )
+
+
+@pytest.fixture
+def model_b():
+    # same Glucose[c] compound but a DIFFERENT id
+    return _model(
+        "B",
+        [
+            cobra.Metabolite("glucose_c", name="Glucose", compartment="c"),
+            cobra.Metabolite("lac_c", name="Lactate", compartment="c"),
+        ],
+        [{"id": "LDH", "equation": "glucose_c --> lac_c", "gene_reaction_rule": "GB"}],
+    )
+
+
+def test_unifies_metabolites_by_name_comp(model_a, model_b):
+    merged = merge_models([model_a, model_b])
+    glucoses = [m for m in merged.metabolites if m.name == "Glucose" and m.compartment == "c"]
+    assert len(glucoses) == 1  # glc_c and glucose_c unified
+    # both reactions reference the same merged Glucose object
+    hex_glc = [m for m in merged.reactions.get_by_id("HEX").metabolites if m.name == "Glucose"][0]
+    ldh_glc = [m for m in merged.reactions.get_by_id("LDH").metabolites if m.name == "Glucose"][0]
+    assert hex_glc is ldh_glc
+
+
+def test_match_by_id_keeps_distinct(model_a, model_b):
+    merged = merge_models([model_a, model_b], match_by="id")
+    glucoses = [m for m in merged.metabolites if m.name == "Glucose"]
+    assert len(glucoses) == 2  # glc_c and glucose_c are distinct by id
+
+
+def test_all_reactions_kept(model_a, model_b):
+    merged = merge_models([model_a, model_b])
+    assert {"HEX", "LDH"} <= {r.id for r in merged.reactions}
+
+
+def test_reaction_id_collision_renamed(model_a):
+    # two models with the same reaction id but different chemistry
+    other = _model(
+        "B",
+        [cobra.Metabolite("glc_c", name="Glucose", compartment="c"),
+         cobra.Metabolite("x_c", name="X", compartment="c")],
+        [{"id": "HEX", "equation": "glc_c --> x_c"}],
+    )
+    merged = merge_models([model_a, other])
+    assert "HEX" in {r.id for r in merged.reactions}
+    assert "HEX_B" in {r.id for r in merged.reactions}  # renamed with source id
+
+
+def test_genes_merged(model_a, model_b):
+    merged = merge_models([model_a, model_b])
+    assert {"GA", "GB"} <= {g.id for g in merged.genes}
+
+
+def test_provenance_recorded(model_a, model_b):
+    merged = merge_models([model_a, model_b])
+    assert merged.reactions.get_by_id("HEX").notes["origin"] == "A"
+    assert merged.reactions.get_by_id("LDH").notes["origin"] == "B"
+    assert merged.genes.get_by_id("GA").notes["origin"] == "A"
+
+
+def test_compartments_preserved(model_a):
+    model_a.compartments = {"c": "cytoplasm"}
+    merged = merge_models([model_a, model_a.copy()])
+    assert merged.compartments.get("c") == "cytoplasm"
+
+
+def test_single_model_returns_copy(model_a):
+    merged = merge_models([model_a])
+    assert merged is not model_a
+    assert {r.id for r in merged.reactions} == {r.id for r in model_a.reactions}
+
+
+def test_three_models(model_a, model_b):
+    c = _model("C", [cobra.Metabolite("co2_c", name="CO2", compartment="c")],
+               [{"id": "SINK", "equation": "co2_c -->"}])
+    merged = merge_models([model_a, model_b, c])
+    assert {"HEX", "LDH", "SINK"} <= {r.id for r in merged.reactions}
+
+
+def test_bad_match_by(model_a, model_b):
+    with pytest.raises(ValueError, match="match_by"):
+        merge_models([model_a, model_b], match_by="oops")
+
+
+# --- regression: formula/charge conflict (known_issues.md B1) --------------
+
+def test_formula_conflict_warns():
+    """Two models sharing a name[comp] but with different formulas warn instead
+    of silently keeping the first."""
+    a = _model("A",
+        [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", compartment="c")],
+        [{"id": "EX_A", "equation": "g1 -->"}])
+    b = _model("B",
+        [cobra.Metabolite("g2", name="Glucose", formula="C6H12O7", compartment="c")],
+        [{"id": "EX_B", "equation": "g2 -->"}])
+    with pytest.warns(UserWarning, match="different formulas"):
+        merged = merge_models([a, b])
+    # The merge still picks the first-seen — the test asserts the warning fired
+    # and the model survives.
+    assert "EX_A" in merged.reactions and "EX_B" in merged.reactions
+
+
+def test_charge_conflict_warns():
+    a = _model("A",
+        [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", charge=0, compartment="c")],
+        [{"id": "EX_A", "equation": "g1 -->"}])
+    b = _model("B",
+        [cobra.Metabolite("g2", name="Glucose", formula="C6H12O6", charge=-1, compartment="c")],
+        [{"id": "EX_B", "equation": "g2 -->"}])
+    with pytest.warns(UserWarning, match="different charges"):
+        merge_models([a, b])
diff --git a/tests/test_manipulation_remove.py b/tests/test_manipulation_remove.py
new file mode 100644
index 0000000..2b659b9
--- /dev/null
+++ b/tests/test_manipulation_remove.py
@@ -0,0 +1,97 @@
+"""Tests for raven_python.manipulation.remove (removeMets/removeGenes ports)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import (
+    add_reactions_from_equations,
+    remove_genes,
+    remove_metabolites,
+)
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+            cobra.Metabolite("atp_m", name="ATP", compartment="m"),
+            cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+            cobra.Metabolite("x_c", name="X", compartment="c"),
+        ]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"},
+            {"id": "R2", "equation": "atp_c --> x_c", "gene_reaction_rule": "G3 or G4"},
+            {"id": "R3", "equation": "atp_m --> adp_c"},  # no GPR (spontaneous)
+        ],
+    )
+    return m
+
+
+# --- remove_metabolites ----------------------------------------------------
+
+def test_remove_metabolites_by_id(model):
+    remove_metabolites(model, ["x_c"])
+    assert "x_c" not in model.metabolites
+    # reaction kept, just lost the metabolite
+    assert "R2" in model.reactions
+
+
+def test_remove_metabolites_by_name_across_compartments(model):
+    # "ATP" exists in c and m; by_name removes both at once.
+    remove_metabolites(model, ["ATP"], by_name=True)
+    assert "atp_c" not in model.metabolites
+    assert "atp_m" not in model.metabolites
+    assert "adp_c" in model.metabolites
+
+
+def test_remove_metabolites_destructive(model):
+    remove_metabolites(model, ["adp_c"], destructive=True)
+    # R1 and R3 both produced adp_c -> removed
+    assert "adp_c" not in model.metabolites
+    assert "R1" not in model.reactions and "R3" not in model.reactions
+
+
+# --- remove_genes ----------------------------------------------------------
+
+def test_remove_genes_remove_mode(model):
+    blocked = remove_genes(model, ["G1"], blocked_reactions="remove")
+    # R1 = "G1 and G2": removing G1 breaks the complex -> blocked -> removed
+    assert blocked == ["R1"]
+    assert "R1" not in model.reactions
+    assert "R2" in model.reactions  # OR rule unaffected
+
+
+def test_remove_genes_constrain_mode(model):
+    blocked = remove_genes(model, ["G1"], blocked_reactions="constrain")
+    assert blocked == ["R1"]
+    r1 = model.reactions.get_by_id("R1")
+    assert r1.bounds == (0, 0)  # kept but constrained, per RAVEN default
+    assert r1.gene_reaction_rule == ""
+
+
+def test_remove_genes_keep_mode(model):
+    blocked = remove_genes(model, ["G1"], blocked_reactions="keep")
+    assert blocked == ["R1"]
+    r1 = model.reactions.get_by_id("R1")
+    assert r1.gene_reaction_rule == ""
+    assert r1.bounds != (0, 0)  # left untouched
+
+
+def test_remove_genes_or_rule_not_blocked(model):
+    blocked = remove_genes(model, ["G3"], blocked_reactions="remove")
+    # R2 = "G3 or G4": removing G3 leaves G4 -> not blocked
+    assert blocked == []
+    assert model.reactions.get_by_id("R2").gene_reaction_rule == "G4"
+
+
+def test_remove_genes_absent_gene_is_noop(model):
+    assert remove_genes(model, ["NOPE"]) == []
+
+
+def test_remove_genes_bad_policy(model):
+    with pytest.raises(ValueError, match="blocked_reactions"):
+        remove_genes(model, ["G1"], blocked_reactions="explode")
diff --git a/tests/test_manipulation_simplify.py b/tests/test_manipulation_simplify.py
new file mode 100644
index 0000000..586a0c3
--- /dev/null
+++ b/tests/test_manipulation_simplify.py
@@ -0,0 +1,184 @@
+"""Tests for simplifyModel reduction modes."""
+import cobra
+import pytest
+
+from raven_python.manipulation import (
+    add_reactions_from_equations,
+    constrain_reversible_reactions,
+    group_linear_reactions,
+    remove_dead_end_reactions,
+    remove_duplicate_reactions,
+)
+
+# --- remove_dead_end_reactions --------------------------------------------
+
+def test_dead_end_removed():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "dead")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R_in", "equation": " --> a"},
+            {"id": "R1", "equation": "a --> b"},
+            {"id": "R_out", "equation": "b --> "},
+            {"id": "R_dead", "equation": "a --> dead"},  # 'dead' only produced
+        ],
+    )
+    removed_rxns, removed_mets = remove_dead_end_reactions(m)
+    assert "R_dead" in removed_rxns
+    assert "dead" in removed_mets
+    # the productive path survives
+    assert {"R_in", "R1", "R_out"} <= {r.id for r in m.reactions}
+
+
+def test_dead_end_respects_reserved():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "dead")])
+    add_reactions_from_equations(
+        m, [{"id": "R_in", "equation": " --> a"}, {"id": "R_dead", "equation": "a --> dead"}]
+    )
+    removed_rxns, _ = remove_dead_end_reactions(m, reserved=["R_dead"])
+    assert "R_dead" not in removed_rxns
+    assert "R_dead" in {r.id for r in m.reactions}
+
+
+# --- remove_duplicate_reactions -------------------------------------------
+
+def test_duplicates_removed():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)},
+            {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)},  # duplicate of R1
+            {"id": "R3", "equation": "a --> b", "bounds": (0, 500)},   # different bounds
+        ],
+    )
+    removed = remove_duplicate_reactions(m)
+    assert len(removed) == 1  # one of R1/R2 removed
+    assert {"R3"} <= {r.id for r in m.reactions}
+    assert sum(r.id in ("R1", "R2") for r in m.reactions) == 1
+
+
+def test_duplicates_keep_reserved():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)},
+            {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)},
+        ],
+    )
+    remove_duplicate_reactions(m, reserved=["R1"])
+    assert "R1" in {r.id for r in m.reactions}  # reserved one kept
+
+
+# --- constrain_reversible_reactions ---------------------------------------
+
+def test_forward_only_reversible_constrained():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R_in", "equation": " --> a", "bounds": (0, 1000)},
+            {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)},  # can only go fwd
+            {"id": "R_out", "equation": "b --> ", "bounds": (0, 1000)},
+        ],
+    )
+    changed = constrain_reversible_reactions(m)
+    assert "R1" in changed
+    assert m.reactions.get_by_id("R1").lower_bound == 0  # constrained to forward
+
+
+def test_truly_reversible_unchanged():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R_in", "equation": " <=> a", "bounds": (-1000, 1000)},
+            {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)},
+            {"id": "R_out", "equation": "b <=> ", "bounds": (-1000, 1000)},
+        ],
+    )
+    changed = constrain_reversible_reactions(m)
+    assert "R1" not in changed  # can go both ways
+
+
+# --- group_linear_reactions -----------------------------------------------
+
+def test_linear_chain_merged():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b"},  # b: single producer
+            {"id": "R2", "equation": "b --> c"},  # b: single consumer
+        ],
+    )
+    n_before = len(m.reactions)
+    group_linear_reactions(m)
+    # b is eliminated; R1+R2 merged into one reaction a --> c
+    assert "b" not in m.metabolites
+    assert len(m.reactions) < n_before
+
+
+def test_group_linear_discards_genes():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b", "gene_reaction_rule": "G1"},
+            {"id": "R2", "equation": "b --> c", "gene_reaction_rule": "G2"},
+        ],
+    )
+    group_linear_reactions(m)
+    assert len(m.genes) == 0
+
+
+# --- regression: incremental merge collapses a long chain (known_issues.md D1) ---
+
+def test_group_linear_merges_long_chain_in_one_pass():
+    """The incremental scan still flattens a 5-reaction linear chain — the
+    correctness property the original O(n²·m) restart-after-merge loop had."""
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in "abcdef"])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R_in", "equation": " --> a"},
+            {"id": "R1", "equation": "a --> b"},
+            {"id": "R2", "equation": "b --> c"},
+            {"id": "R3", "equation": "c --> d"},
+            {"id": "R4", "equation": "d --> e"},
+            {"id": "R5", "equation": "e --> f"},
+            {"id": "R_out", "equation": "f --> "},
+        ],
+    )
+    group_linear_reactions(m)
+    # All the chain's internal metabolites are gone.
+    assert {x for x in m.metabolites if x.id in {"b", "c", "d", "e"}} == set()
+
+
+# --- regression: NaN FVA on infeasible model (known_issues.md C1) ----------
+
+def test_constrain_reversible_raises_on_infeasible():
+    """An infeasible model produces NaN FVA ranges; the old abs(NaN) < eps
+    check silently treated those as 'truly reversible'. Now raises."""
+    m = cobra.Model("t")
+    a, b = (cobra.Metabolite(x, compartment="c") for x in ("a", "b"))
+    m.add_metabolites([a, b])
+    # Force a contradiction: r requires production AND consumption of a, but
+    # nothing else produces a.
+    r = cobra.Reaction("r", lower_bound=-1, upper_bound=1)
+    r.add_metabolites({a: -1, b: 1})
+    forced = cobra.Reaction("forced", lower_bound=5, upper_bound=10)  # infeasible
+    forced.add_metabolites({a: -1})
+    m.add_reactions([r, forced])
+    with pytest.raises(RuntimeError, match="infeasible"):
+        constrain_reversible_reactions(m)
diff --git a/tests/test_manipulation_transfer.py b/tests/test_manipulation_transfer.py
new file mode 100644
index 0000000..61c2ac9
--- /dev/null
+++ b/tests/test_manipulation_transfer.py
@@ -0,0 +1,137 @@
+"""Tests for add_reactions_from_model (addRxnsGenesMets port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, add_reactions_from_model
+
+
+@pytest.fixture
+def draft():
+    m = cobra.Model("draft")
+    m.add_metabolites(
+        [cobra.Metabolite("glc_c", name="Glucose", formula="C6H12O6", compartment="c")]
+    )
+    # an existing reaction so glc_c is in use and we have an id to test skipping
+    add_reactions_from_equations(m, [{"id": "R_existing", "equation": "glc_c <=>"}])
+    return m
+
+
+@pytest.fixture
+def source():
+    m = cobra.Model("source")
+    m.add_metabolites(
+        [
+            # same name[comp] as draft's glc_c but a DIFFERENT id
+            cobra.Metabolite("glucose_c", name="Glucose", formula="C6H12O6", compartment="c"),
+            cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+            cobra.Metabolite("g6p_c", name="G6P", formula="C6H13O9P", compartment="c"),
+        ]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {
+                "id": "HEX",
+                "equation": "glucose_c + atp_c --> g6p_c",
+                "name": "hexokinase",
+                "bounds": (0, 1000),
+                "gene_reaction_rule": "G1",
+                "subsystem": "glycolysis",
+            },
+            {"id": "R_existing", "equation": "glucose_c <=>"},  # id already in draft
+        ],
+    )
+    return m
+
+
+def test_metabolite_matched_by_name_comp_not_id(draft, source):
+    add_reactions_from_model(draft, source, "HEX")
+    hex_rxn = draft.reactions.get_by_id("HEX")
+    # Glucose reused from the draft (id glc_c), NOT the source's glucose_c
+    assert "glc_c" in {m.id for m in hex_rxn.metabolites}
+    assert "glucose_c" not in draft.metabolites
+
+
+def test_new_metabolites_added_with_metadata(draft, source):
+    add_reactions_from_model(draft, source, "HEX")
+    assert "atp_c" in draft.metabolites and "g6p_c" in draft.metabolites
+    assert draft.metabolites.get_by_id("g6p_c").formula == "C6H13O9P"
+    assert draft.metabolites.get_by_id("atp_c").charge == -4
+
+
+def test_reaction_copied_with_bounds_and_name(draft, source):
+    (rxn,) = add_reactions_from_model(draft, source, "HEX")
+    assert rxn.id == "HEX"
+    assert rxn.name == "hexokinase"
+    assert rxn.bounds == (0, 1000)
+    assert rxn.subsystem == "glycolysis"
+    assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+        "glc_c": -1.0,
+        "atp_c": -1.0,
+        "g6p_c": 1.0,
+    }
+
+
+def test_genes_true_copies_gpr_and_creates_genes(draft, source):
+    add_reactions_from_model(draft, source, "HEX", genes=True)
+    assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G1"
+    assert "G1" in draft.genes
+
+
+def test_genes_false_no_gpr(draft, source):
+    add_reactions_from_model(draft, source, "HEX", genes=False)
+    assert draft.reactions.get_by_id("HEX").gene_reaction_rule == ""
+
+
+def test_genes_string_override(draft, source):
+    add_reactions_from_model(draft, source, "HEX", genes="G9 or G10")
+    assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G9 or G10"
+
+
+def test_skips_already_present(draft, source):
+    added = add_reactions_from_model(draft, source, ["HEX", "R_existing"])
+    assert [r.id for r in added] == ["HEX"]
+
+
+def test_all_present_raises(draft, source):
+    with pytest.raises(ValueError, match="already in the model"):
+        add_reactions_from_model(draft, source, "R_existing")
+
+
+def test_unknown_source_reaction_raises(draft, source):
+    with pytest.raises(ValueError, match="not found in the source model"):
+        add_reactions_from_model(draft, source, "NOPE")
+
+
+def test_note_and_confidence_stored(draft, source):
+    (rxn,) = add_reactions_from_model(draft, source, "HEX", note="from KEGG", confidence=2)
+    assert rxn.notes["note"] == "from KEGG"
+    assert rxn.notes["confidence_score"] == 2
+
+
+# --- regression: intra-batch met-id minting collision (known_issues.md A3) ---
+
+def test_intra_batch_id_minting_unique():
+    """Two source mets whose ids both collide with the draft and whose name[comp]
+    differs both get routed through new-id minting. The fix tracks ids minted in
+    the current batch so the two don't collapse to the same generated id."""
+    draft = cobra.Model("draft")
+    draft.add_metabolites([
+        cobra.Metabolite("atp_c", name="ATP-draft", compartment="c"),
+        cobra.Metabolite("adp_c", name="ADP-draft", compartment="c"),
+    ])
+    source = cobra.Model("source")
+    source.add_metabolites([
+        cobra.Metabolite("atp_c", name="ATP-source", compartment="c"),
+        cobra.Metabolite("adp_c", name="ADP-source", compartment="c"),
+    ])
+    rxn = cobra.Reaction("R1", lower_bound=0, upper_bound=1000)
+    source.add_reactions([rxn])
+    rxn.add_metabolites({
+        source.metabolites.get_by_id("atp_c"): -1,
+        source.metabolites.get_by_id("adp_c"): 1,
+    })
+    add_reactions_from_model(draft, source, "R1")
+    # Both source mets minted distinct ids (m1 and m2) — not a collision.
+    new_ids = sorted(m.id for m in draft.metabolites if m.id not in ("atp_c", "adp_c"))
+    assert len(new_ids) == 2 and len(set(new_ids)) == 2
diff --git a/tests/test_manipulation_transport.py b/tests/test_manipulation_transport.py
new file mode 100644
index 0000000..e8fb2b6
--- /dev/null
+++ b/tests/test_manipulation_transport.py
@@ -0,0 +1,98 @@
+"""Tests for add_transport_reactions (addTransport port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_transport_reactions
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.compartments = {"c": "cytoplasm", "m": "mitochondrion", "e": "extracellular"}
+    m.add_metabolites(
+        [
+            cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+            cobra.Metabolite("h2o_c", name="H2O", formula="H2O", compartment="c"),
+            cobra.Metabolite("atp_m", name="ATP", compartment="m"),  # exists in m
+        ]
+    )
+    return m
+
+
+def test_basic_transport_to_existing(model):
+    added = add_transport_reactions(model, "c", "m", ["ATP"])
+    assert len(added) == 1
+    rxn = added[0]
+    assert rxn.id == "tr_0001"
+    assert rxn.name == "ATP transport, cytoplasm-mitochondrion"
+    assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+        "atp_c": -1.0,
+        "atp_m": 1.0,
+    }
+    assert rxn.reversibility is True
+
+
+def test_only_to_existing_skips_missing(model):
+    # H2O is not in m; with only_to_existing (default) it's skipped
+    added = add_transport_reactions(model, "c", "m", ["ATP", "H2O"])
+    assert [r.id for r in added] == ["tr_0001"]  # only ATP
+
+
+def test_creates_missing_target_metabolite(model):
+    added = add_transport_reactions(
+        model, "c", "m", ["H2O"], only_to_existing=False
+    )
+    assert len(added) == 1
+    new = [mt for mt in model.metabolites if mt.name == "H2O" and mt.compartment == "m"]
+    assert len(new) == 1
+    assert new[0].formula == "H2O"  # copied from source
+
+
+def test_copies_formula_and_charge(model):
+    add_transport_reactions(model, "c", "e", ["ATP"], only_to_existing=False)
+    new = [mt for mt in model.metabolites if mt.name == "ATP" and mt.compartment == "e"][0]
+    assert new.formula == "C10H16N5O13P3"
+    assert new.charge == -4
+
+
+def test_irreversible(model):
+    (rxn,) = add_transport_reactions(model, "c", "m", ["ATP"], reversible=False)
+    assert rxn.lower_bound == 0
+    assert rxn.reversibility is False
+
+
+def test_default_all_metabolites_in_from(model):
+    # default metabolite_names = all in c (ATP, H2O); to m, only_to_existing -> only ATP
+    added = add_transport_reactions(model, "c", "m")
+    assert [r.id for r in added] == ["tr_0001"]
+
+
+def test_multiple_target_compartments_and_sequential_ids(model):
+    added = add_transport_reactions(
+        model, "c", ["m", "e"], ["ATP"], only_to_existing=False
+    )
+    assert [r.id for r in added] == ["tr_0001", "tr_0002"]
+
+
+def test_unknown_compartment_raises(model):
+    with pytest.raises(ValueError, match="not in the model"):
+        add_transport_reactions(model, "x", "m", ["ATP"])
+
+
+def test_unknown_metabolite_raises(model):
+    with pytest.raises(ValueError, match="not found in compartment"):
+        add_transport_reactions(model, "c", "m", ["NOPE"])
+
+
+# --- regression: duplicate name in compartment (known_issues.md A4) --------
+
+def test_duplicate_name_in_source_compartment_warns(model):
+    """Two source mets sharing a name in the same compartment warn instead of
+    silently collapsing — previously one was dropped from the lookup dict."""
+    model.add_metabolites([
+        cobra.Metabolite("h2o2_c", name="H2O", compartment="c"),  # duplicate name
+    ])
+    with pytest.warns(UserWarning, match="Multiple metabolites named 'H2O'"):
+        added = add_transport_reactions(model, "c", "m", ["H2O"], only_to_existing=False)
+    # Transport still works (uses the first match) — the warning is the signal.
+    assert len(added) == 1
diff --git a/tests/test_parameters.py b/tests/test_parameters.py
new file mode 100644
index 0000000..c0ab06c
--- /dev/null
+++ b/tests/test_parameters.py
@@ -0,0 +1,60 @@
+"""Tests for set_variance_bounds (the var mode of setParam)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, set_variance_bounds
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a_c <=> b_c"},
+            {"id": "R2", "equation": "a_c <=> b_c"},
+        ],
+    )
+    return m
+
+
+def test_band_positive(model):
+    set_variance_bounds(model, "R1", 100, 5)  # 97.5 .. 102.5
+    lb, ub = model.reactions.get_by_id("R1").bounds
+    assert lb == pytest.approx(97.5)
+    assert ub == pytest.approx(102.5)
+
+
+def test_band_negative_is_ordered(model):
+    set_variance_bounds(model, "R1", -100, 5)
+    lb, ub = model.reactions.get_by_id("R1").bounds
+    assert lb == pytest.approx(-102.5)
+    assert ub == pytest.approx(-97.5)
+    assert lb <= ub
+
+
+def test_broadcast_scalar(model):
+    set_variance_bounds(model, ["R1", "R2"], 50, 10)
+    for rid in ("R1", "R2"):
+        lb, ub = model.reactions.get_by_id(rid).bounds
+        assert lb == pytest.approx(47.5)
+        assert ub == pytest.approx(52.5)
+
+
+def test_per_reaction_values(model):
+    set_variance_bounds(model, ["R1", "R2"], [100, 200], 0)
+    assert model.reactions.get_by_id("R1").bounds == pytest.approx((100, 100))
+    assert model.reactions.get_by_id("R2").bounds == pytest.approx((200, 200))
+
+
+def test_length_mismatch_raises(model):
+    with pytest.raises(ValueError, match="to match the reactions"):
+        set_variance_bounds(model, ["R1", "R2"], [1, 2, 3], 5)
+
+
+def test_unknown_reaction_raises(model):
+    with pytest.raises(ValueError, match="not found"):
+        set_variance_bounds(model, "NOPE", 1, 5)

From a1dc557ece5d83d38466e96d676eeec66f90b1bb Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:50:05 +0200
Subject: [PATCH 3/3] Add binary + data resolvers for external tools and
 published artefacts

---
 docs/maintaining_binaries.md     | 236 +++++++++++++++++++++++++++++++
 scripts/make_registry_snippet.py | 102 +++++++++++++
 src/raven_python/binaries.py     | 148 +++++++++++++++++++
 src/raven_python/data.py         | 135 ++++++++++++++++++
 tests/test_binaries.py           |  80 +++++++++++
 tests/test_data.py               |  89 ++++++++++++
 tests/test_scripts_registry.py   |  58 ++++++++
 7 files changed, 848 insertions(+)
 create mode 100644 docs/maintaining_binaries.md
 create mode 100644 scripts/make_registry_snippet.py
 create mode 100644 src/raven_python/binaries.py
 create mode 100644 src/raven_python/data.py
 create mode 100644 tests/test_binaries.py
 create mode 100644 tests/test_data.py
 create mode 100644 tests/test_scripts_registry.py

diff --git a/docs/maintaining_binaries.md b/docs/maintaining_binaries.md
new file mode 100644
index 0000000..df5b315
--- /dev/null
+++ b/docs/maintaining_binaries.md
@@ -0,0 +1,236 @@
+# Maintaining bundled binaries (BLAST+, DIAMOND, …)
+
+Audience: **raven-python maintainers / the GitHub repo owner.** This explains how
+raven-python ships external command-line tools, how to update their versions, and how
+to build **minimal-footprint** ZIPs to attach to a GitHub release.
+
+> End users never read this. They get a binary automatically via `ensure_binary`,
+> or use their own (system/conda) install. This doc is only for whoever publishes
+> the release assets.
+
+---
+
+## 1. How binary provisioning works
+
+raven-python does **not** vendor binaries in the git repo or on PyPI. Instead:
+
+1. For each tool we publish **version-pinned ZIPs as GitHub release assets**.
+2. A **registry** (`src/raven_python/binaries_registry.json`) maps each *bundle* to its
+   version, the executables it provides, and per-platform `{asset, sha256}`.
+3. At run time `raven_python.binaries.ensure_binary("blastp")` resolves a tool in this
+   order — and only reaches the download as a last resort:
+
+   ```
+   explicit binary= arg  →  env var (RAVEN_PYTHON_BLASTP / RAVEN_PYTHON_DIAMOND / …)
+     →  shutil.which on PATH (system / conda / apt / brew)
+     →  ensure_binary: download the pinned ZIP → verify SHA256 → cache → return path
+     →  actionable error (with conda / manual instructions)
+   ```
+
+So a pre-installed binary always wins; the bundle is the zero-setup fallback.
+Pinning the version makes reconstruction **reproducible**.
+
+A *bundle* can provide several executables from one download (e.g. the `blast`
+bundle provides both `blastp` and `makeblastdb`), so they are fetched once.
+
+---
+
+## 2. What raven-python actually needs — ship only these
+
+Distribute the **minimum** set of executables. Everything else (other suite
+tools, docs, examples, changelogs) must be excluded.
+
+| Bundle | Executables to include | Everything else |
+|---|---|---|
+| `diamond` | `diamond` | — (it is a single static binary) |
+| `blast` | `blastp`, `makeblastdb` | **drop** `blastn`, `tblastn`, `psiblast`, `rpsblast`, `blast_formatter`, `*_vdb`, the `doc/`, `ChangeLog`, `README`, ~30 other tools |
+
+(Confirmed against RAVEN `getBlast`/`getDiamond`: only `makeblastdb`+`blastp`, and
+`diamond` for its `makedb`/`blastp` subcommands, are ever invoked.)
+
+For BLAST+ this is the big win: the full NCBI suite is ~hundreds of MB; two
+binaries (stripped) are a small fraction.
+
+---
+
+## 3. Asset & ZIP conventions
+
+**Asset filename:** `<bundle>-<version>-<os>-<arch>.zip`
+
+- `<os>` ∈ `linux`, `macos`, `windows`
+- `<arch>` ∈ `x86_64`, `arm64`
+- examples: `diamond-2.1.11-linux-x86_64.zip`, `blast-2.16.0-macos-arm64.zip`
+
+**ZIP layout — flat, executables at the root, plus the upstream licence:**
+
+```
+diamond-2.1.11-linux-x86_64.zip
+├── diamond
+└── LICENSE
+
+blast-2.16.0-linux-x86_64.zip
+├── blastp
+├── makeblastdb
+└── LICENSE
+```
+
+No nested `bin/`, no extra files. `ensure_binary` extracts the ZIP into the cache
+and expects the executable at the top level.
+
+---
+
+## 4. Step-by-step: add or update a version
+
+Example: bump DIAMOND to a new version for Linux x86-64. Repeat per `(os, arch)`.
+
+1. **Download the official upstream build** (never rebuild from source unless you
+   must):
+   - DIAMOND → <https://github.com/bbuchfink/diamond/releases>
+     (`diamond-linux64.tar.gz`, `diamond-macos.tar.gz`)
+   - BLAST+ → <https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/> or a
+     pinned version dir (`ncbi-blast-<ver>+-x64-linux.tar.gz`,
+     `-x64-macosx.tar.gz`, `-aarch64-linux.tar.gz`, `-x64-win64.tar.gz`).
+   - Record the upstream URL **and** its published checksum for provenance.
+2. **Extract only the needed executables** (see §2) to a clean staging dir.
+3. **Strip debug symbols** to shrink (skip on Windows / signed macOS builds):
+   ```bash
+   strip diamond           # or: strip blastp makeblastdb
+   ```
+4. **Smoke-test the stripped binaries in a clean shell** (no other tools on PATH):
+   ```bash
+   ./diamond --version
+   ./blastp -version && ./makeblastdb -version
+   ```
+   If they fail for a missing shared library, add that `.so`/`.dylib` to the ZIP
+   (rare — NCBI/DIAMOND release builds are largely self-contained).
+5. **Add the upstream licence file** as `LICENSE` (see §6).
+6. **Zip with max compression, flat layout:**
+   ```bash
+   zip -9 -j diamond-2.1.11-linux-x86_64.zip diamond LICENSE
+   # -j junks paths so entries sit at the ZIP root
+   ```
+7. **Compute the SHA256:**
+   ```bash
+   sha256sum diamond-2.1.11-linux-x86_64.zip   # shasum -a 256 on macOS
+   ```
+8. **Attach the ZIP to a raven-python GitHub release** (a release tagged for the binary
+   set, e.g. `binaries-2024.06`, keeps them independent of code releases).
+9. **Update the registry** `src/raven_python/binaries_registry.json` — bump `version`
+   and set the per-platform `asset` + `sha256`:
+   ```json
+   {
+     "diamond": {
+       "version": "2.1.11",
+       "provides": ["diamond"],
+       "platforms": {
+         "linux-x86_64": {
+           "asset": "diamond-2.1.11-linux-x86_64.zip",
+           "url": "https://github.com/SysBioChalmers/raven-python/releases/download/binaries-2024.06/diamond-2.1.11-linux-x86_64.zip",
+           "sha256": "<sha256>"
+         }
+       }
+     },
+     "blast": {
+       "version": "2.16.0",
+       "provides": ["blastp", "makeblastdb"],
+       "platforms": { "linux-x86_64": { "asset": "...", "url": "...", "sha256": "..." } }
+     }
+   }
+   ```
+10. **Commit the registry change**, run the homology tests, and (if you have the
+    binary) confirm `ensure_binary("diamond", version="2.1.11")` downloads,
+    verifies, and runs.
+
+---
+
+## 5. Keeping the footprint minimal — checklist
+
+- ✅ Only the executables in §2 (for BLAST+, exactly `blastp` + `makeblastdb`).
+- ✅ `strip` the binaries (often halves their size).
+- ✅ `zip -9 -j` (max compression, flat — no `bin/`, no folders).
+- ✅ Exactly one extra file: `LICENSE`.
+- ❌ No docs, examples, `ChangeLog`, `README`, man pages, test data, or sibling tools.
+- ❌ No `.dSYM`/debug bundles; no duplicate static `.a` libraries.
+- ➕ Only add a shared library if step-4 testing proves it is required.
+
+---
+
+## 6. Platform / architecture matrix & licensing
+
+**Coverage = what you build.** Start with `linux-x86_64` (CI default), then add
+`macos-arm64`, `macos-x86_64`, `linux-arm64`, `windows-x86_64` as capacity allows.
+For any `(os, arch)` **not** in the registry, `ensure_binary` raises an actionable
+error pointing to conda (`conda install -c bioconda diamond blast`) or a manual
+install — that is the documented fallback, not a failure to fix urgently.
+
+**Licensing (must comply when redistributing):**
+
+- **BLAST+** — produced by NCBI (US Government); **public domain**, free to
+  redistribute. Include NCBI's `LICENSE` for courtesy/provenance.
+- **DIAMOND** — **GPLv3**. Redistribution is allowed; you **must** include the
+  GPLv3 licence text in the ZIP and keep the binary unmodified (or offer source).
+- **HMMER** (future) — BSD-3-Clause; include its `LICENSE`.
+
+Always ship the upstream licence in the ZIP, and keep a `BINARIES_PROVENANCE.md`
+(or a note in the release body) recording, per asset: upstream URL, upstream
+version, upstream checksum, and the SHA256 you published.
+
+### Native OS support per tool
+
+raven-python invokes each tool through `subprocess.run([resolved_path, …])` — that
+call is itself cross-platform, so the real constraint is whether a given tool has
+a binary that runs natively on each OS. It varies:
+
+| Tool | Linux | macOS (incl. arm64) | Windows (native) |
+|---|---|---|---|
+| BLAST+ (`blastp`, `makeblastdb`) | ✅ | ✅ | ✅ (NCBI ships Windows builds) |
+| DIAMOND | ✅ | ✅ | ⚠️ native build exists but Linux-first |
+| HMMER (`hmmbuild`/`hmmpress`/`hmmsearch`/`hmmscan`) | ✅ | ✅ | ❌ no official native build |
+| MAFFT | ✅ | ✅ | ⚠️ Windows package is a wrapper |
+| CD-HIT | ✅ | ✅ | ❌ no Windows build exists |
+
+Implications:
+
+- **Linux / macOS** — everything works. `conda install -c bioconda hmmer mafft
+  cd-hit blast diamond`, or point the `RAVEN_PYTHON_*` env vars at your installs.
+- **Native Windows** — the homology track (BLAST+/DIAMOND) works, but the **KEGG
+  HMM build (3b.3) and HMM query (3b.5) do not**: HMMER and CD-HIT have no Windows
+  binaries, and bioconda has no Windows packages for any of them. Bundling can't
+  fix this — there is no binary to bundle.
+- **Windows users should run raven-python inside WSL2** (or a Linux container), where
+  every tool is native Linux. raven-python does **not** replicate RAVEN's
+  `getWSLpath`/`wsl …` path translation: it calls the resolved binary directly, so
+  mixing native-Windows Python with WSL binaries is unsupported — keep the whole
+  stack inside WSL2.
+- The common end-user paths — homology reconstruction and the KEGG *species* model
+  (3b.4) — need no HMMER/MAFFT/CD-HIT, so they are fully cross-platform.
+
+---
+
+## 7. Emitting the registry entry
+
+After building the per-platform ZIPs (named `<bundle>-<version>-<os>-<arch>.zip`)
+and uploading them to the release, generate the `_REGISTRY` entry — checksums and
+URLs — with [`scripts/make_registry_snippet.py`](../scripts/README.md):
+
+```bash
+python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \
+    --provides blastp makeblastdb --dir zips \
+    --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0
+```
+
+It prints the ready-to-paste `_REGISTRY["blast"]` block; its SHA256 helper is the
+same one `ensure_binary` verifies with, so the checksums always match. (Producing
+the minimal ZIPs themselves — download upstream, `strip`, `zip -9 -j`, add
+`LICENSE` per §3–§6 — is still a manual/per-tool step.)
+
+---
+
+## 8. Adding a new tool later (e.g. HMMER for KEGG reconstruction)
+
+1. Decide the **minimal executable set** (e.g. HMMER → `hmmsearch`, `hmmscan`,
+   maybe `hmmbuild`/`hmmpress`).
+2. Add a bundle entry to the registry with `provides` listing those executables.
+3. Build/attach ZIPs per §3–§4; include the tool's licence (§6).
+4. The wrappers call `ensure_binary("hmmsearch", …)` with the same resolution
+   order — no new provisioning code needed.
diff --git a/scripts/make_registry_snippet.py b/scripts/make_registry_snippet.py
new file mode 100644
index 0000000..3efa49e
--- /dev/null
+++ b/scripts/make_registry_snippet.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+"""Emit ready-to-paste registry entries for published artefacts / binary ZIPs.
+
+Computes the SHA256 of each file and prints the Python/JSON entry to merge into
+``raven_python.data._DATA_REGISTRY`` (data artefacts) or ``raven_python.binaries._REGISTRY``
+(binary bundles). Run once per release, after uploading the files to the release.
+
+Examples
+--------
+Data artefacts (KEGG reference model + tables + HMM libraries) for one release::
+
+    python scripts/make_registry_snippet.py data \\
+        --dataset kegg --version kegg116 --dir artefacts \\
+        --base-url https://github.com/ORG/raven_python/releases/download/kegg-data-kegg116
+
+Binary bundle (one ZIP per platform, named ``<bundle>-<version>-<os>-<arch>.zip``)::
+
+    python scripts/make_registry_snippet.py binary \\
+        --bundle blast --version 2.16.0 --provides blastp makeblastdb --dir zips \\
+        --base-url https://github.com/ORG/raven_python/releases/download/blast-2.16.0
+
+The SHA256 helper is shared with the runtime resolvers (``raven_python.binaries``), so
+published checksums always match what ``ensure_data`` / ``ensure_binary`` verify.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from raven_python.binaries import _sha256
+
+
+def _files_in(directory: Path) -> list[Path]:
+    """Regular, non-hidden files in ``directory``, sorted by name."""
+    return sorted(p for p in directory.iterdir() if p.is_file() and not p.name.startswith("."))
+
+
+def data_entry(dataset: str, version: str, base_url: str, directory: Path) -> dict:
+    """Build the ``_DATA_REGISTRY[dataset]`` entry for every file in ``directory``."""
+    base = base_url.rstrip("/")
+    files = {
+        p.name: {"url": f"{base}/{p.name}", "sha256": _sha256(p)} for p in _files_in(directory)
+    }
+    if not files:
+        raise SystemExit(f"No files found in {directory}")
+    return {"version": version, "files": files}
+
+
+def binary_entry(
+    bundle: str, version: str, provides: list[str], base_url: str, directory: Path
+) -> dict:
+    """Build the ``_REGISTRY[bundle]`` entry from ``<bundle>-<version>-<os>-<arch>.zip``."""
+    base = base_url.rstrip("/")
+    prefix = f"{bundle}-{version}-"
+    platforms = {}
+    for zip_path in directory.glob(f"{prefix}*.zip"):
+        platform = zip_path.name[len(prefix) : -len(".zip")]
+        platforms[platform] = {"url": f"{base}/{zip_path.name}", "sha256": _sha256(zip_path)}
+    if not platforms:
+        raise SystemExit(f"No {prefix}*.zip files found in {directory}")
+    return {"version": version, "provides": provides, "platforms": dict(sorted(platforms.items()))}
+
+
+def render(key: str, entry: dict) -> str:
+    """Render ``{key: entry}`` as an indented JSON block (valid Python to paste)."""
+    return json.dumps({key: entry}, indent=4)
+
+
+def main(argv: list[str] | None = None) -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    sub = parser.add_subparsers(dest="kind", required=True)
+
+    d = sub.add_parser("data", help="data-artefact registry entry (raven_python.data)")
+    d.add_argument("--dataset", required=True, help="dataset key, e.g. 'kegg'")
+    d.add_argument("--version", required=True)
+    d.add_argument("--dir", required=True, type=Path, help="directory of uploaded artefacts")
+    d.add_argument("--base-url", required=True, help="release download URL prefix")
+
+    b = sub.add_parser("binary", help="binary-bundle registry entry (raven_python.binaries)")
+    b.add_argument("--bundle", required=True, help="bundle key, e.g. 'blast'")
+    b.add_argument("--version", required=True)
+    b.add_argument("--provides", nargs="+", required=True, help="executables the bundle provides")
+    b.add_argument("--dir", required=True, type=Path, help="directory of uploaded ZIPs")
+    b.add_argument("--base-url", required=True, help="release download URL prefix")
+
+    args = parser.parse_args(argv)
+    if args.kind == "data":
+        key, entry = args.dataset, data_entry(args.dataset, args.version, args.base_url, args.dir)
+        target = "raven_python/data.py  _DATA_REGISTRY"
+    else:
+        key = args.bundle
+        entry = binary_entry(args.bundle, args.version, args.provides, args.base_url, args.dir)
+        target = "raven_python/binaries.py  _REGISTRY"
+
+    print(f"# Merge into {target}:", file=sys.stderr)
+    print(render(key, entry))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py
new file mode 100644
index 0000000..2d4b5a2
--- /dev/null
+++ b/src/raven_python/binaries.py
@@ -0,0 +1,148 @@
+"""Locate and provision external command-line binaries (BLAST+, DIAMOND, …).
+
+Shared across tools (not homology-specific). Resolution order for any executable:
+
+    explicit path arg  →  env var (RAVEN_PYTHON_<TOOL>)  →  shutil.which (PATH)
+      →  ensure_binary  (download the version-pinned ZIP from a raven_python release,
+                         verify SHA256, cache, return the path)
+      →  FileNotFoundError with install guidance
+
+So a pre-installed/conda binary always wins; the bundled ZIP is the zero-setup
+fallback. See docs/maintaining_binaries.md for how the release ZIPs and the
+registry are produced and updated.
+"""
+from __future__ import annotations
+
+import hashlib
+import os
+import platform
+import shutil
+import zipfile
+from pathlib import Path
+from urllib.request import urlopen
+
+# Registry of bundled binaries. Empty until release ZIPs are published; populated
+# per docs/maintaining_binaries.md. Keyed by *bundle*; one bundle can provide
+# several executables (e.g. "blast" -> blastp + makeblastdb).
+#   bundle -> {version, provides:[exe...], platforms:{"<os>-<arch>": {url, sha256}}}
+_REGISTRY: dict = {}
+
+# Environment variable overrides per executable.
+_ENV_VARS = {
+    "diamond": "RAVEN_PYTHON_DIAMOND",
+    "blastp": "RAVEN_PYTHON_BLASTP",
+    "makeblastdb": "RAVEN_PYTHON_MAKEBLASTDB",
+    "hmmbuild": "RAVEN_PYTHON_HMMBUILD",
+    "hmmpress": "RAVEN_PYTHON_HMMPRESS",
+    "hmmsearch": "RAVEN_PYTHON_HMMSEARCH",
+    "hmmscan": "RAVEN_PYTHON_HMMSCAN",
+    "mafft": "RAVEN_PYTHON_MAFFT",
+    "cd-hit": "RAVEN_PYTHON_CDHIT",
+}
+
+
+def platform_key() -> str:
+    """Return the ``<os>-<arch>`` key used in the registry (e.g. ``linux-x86_64``)."""
+    system = {"linux": "linux", "darwin": "macos", "windows": "windows"}.get(
+        platform.system().lower(), platform.system().lower()
+    )
+    machine = platform.machine().lower()
+    arch = {"x86_64": "x86_64", "amd64": "x86_64", "arm64": "arm64", "aarch64": "arm64"}.get(
+        machine, machine
+    )
+    return f"{system}-{arch}"
+
+
+def _cache_dir() -> Path:
+    base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")
+    return Path(base) / "raven_python" / "binaries"
+
+
+def _bundle_for(executable: str, registry: dict):
+    for name, bundle in registry.items():
+        if executable in bundle.get("provides", []):
+            return name, bundle
+    return None, None
+
+
+def _sha256(path: Path) -> str:
+    h = hashlib.sha256()
+    with open(path, "rb") as fh:
+        for chunk in iter(lambda: fh.read(1 << 20), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def ensure_binary(executable: str, *, registry: dict | None = None) -> Path:
+    """Download (if needed) and return the path to a bundled ``executable``.
+
+    Consults the registry for the current platform, downloads the pinned ZIP,
+    verifies its SHA256, extracts it into the cache, and returns the executable
+    path. Raises ``FileNotFoundError`` if no bundle for this platform is hosted.
+    """
+    registry = _REGISTRY if registry is None else registry
+    bundle_name, bundle = _bundle_for(executable, registry)
+    if bundle is None:
+        raise FileNotFoundError(
+            f"No bundled binary registered for {executable!r}. Install it (e.g. "
+            f"`conda install -c bioconda {executable}`) or pass an explicit path."
+        )
+    key = platform_key()
+    entry = bundle.get("platforms", {}).get(key)
+    if entry is None:
+        raise FileNotFoundError(
+            f"No bundled {executable!r} for platform {key!r}. Install it "
+            f"(e.g. `conda install -c bioconda {executable}`), set "
+            f"{_ENV_VARS.get(executable, 'the binary path')}, or pass binary=."
+        )
+
+    dest_dir = _cache_dir() / f"{bundle_name}-{bundle['version']}-{key}"
+    exe = dest_dir / executable
+    if exe.exists():
+        return exe
+
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    archive = dest_dir / "_download.zip"
+    # Download into a sibling .part file and rename on success — an interrupted
+    # download leaves the partial behind .part, never as a half-complete .zip
+    # that a later run might mistake for a finished one. Mirrors data.py.
+    part = archive.with_suffix(archive.suffix + ".part")
+    try:
+        with urlopen(entry["url"]) as resp, open(part, "wb") as out:  # noqa: S310
+            shutil.copyfileobj(resp, out)
+        digest = _sha256(part)
+        if digest != entry["sha256"]:
+            raise ValueError(
+                f"SHA256 mismatch for {executable!r} ({key}): "
+                f"expected {entry['sha256']}, got {digest}."
+            )
+        os.replace(part, archive)
+    finally:
+        part.unlink(missing_ok=True)
+    with zipfile.ZipFile(archive) as zf:
+        zf.extractall(dest_dir)
+    archive.unlink(missing_ok=True)
+    if not exe.exists():
+        raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.")
+    exe.chmod(0o755)
+    return exe
+
+
+def resolve_binary(executable: str, *, binary: str | os.PathLike | None = None) -> str:
+    """Resolve an executable to a path: arg → env var → PATH → bundled ZIP → error."""
+    if binary is not None:
+        return os.fspath(binary)
+    env_var = _ENV_VARS.get(executable)
+    if env_var and os.environ.get(env_var):
+        return os.environ[env_var]
+    found = shutil.which(executable)
+    if found:
+        return found
+    try:
+        return os.fspath(ensure_binary(executable))
+    except FileNotFoundError as exc:
+        raise FileNotFoundError(
+            f"Could not find {executable!r}. Install it (e.g. "
+            f"`conda install -c bioconda {executable}`), put it on PATH, set "
+            f"{env_var or 'the binary path'}, or pass binary=. ({exc})"
+        ) from exc
diff --git a/src/raven_python/data.py b/src/raven_python/data.py
new file mode 100644
index 0000000..b1264be
--- /dev/null
+++ b/src/raven_python/data.py
@@ -0,0 +1,135 @@
+"""Fetch and cache published data artefacts (KEGG reference model, tables, HMMs).
+
+The mirror of :mod:`raven_python.binaries` for *data*: a version-pinned registry of
+downloadable artefacts, fetched on first use, SHA256-verified, and cached under
+platformdirs so end users never rebuild them from a KEGG dump (that is the
+maintainer's job — see docs/maintaining_kegg_data.md).
+
+Resolution for any artefact file:
+
+    explicit local dir  →  cached copy  →  download from the registry (verify,
+        cache)  →  FileNotFoundError with guidance
+
+The registry is **empty until the artefacts are published** (same as
+``binaries._REGISTRY``); until then ``ensure_data_file`` raises an actionable
+error. Cache layout::
+
+    $XDG_CACHE_HOME/raven_python/data/<dataset>-<version>/<filename>
+    (or ~/.cache/raven_python/data/... if XDG_CACHE_HOME is unset)
+"""
+from __future__ import annotations
+
+import os
+import shutil
+from pathlib import Path
+from urllib.request import urlopen
+
+from raven_python.binaries import _sha256
+
+# dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}}
+# Populated when raven_python publishes the KEGG artefacts as release assets.
+_DATA_REGISTRY: dict = {}
+
+# The core KEGG artefacts needed to build a model (no HMM libraries).
+CORE_KEGG_FILES = (
+    "reference_model.yml.gz",
+    "ko_reaction.tsv.gz",
+    "ko_names.tsv.gz",
+    "organism_gene_ko.tsv.xz",
+    "rxn_flags.tsv.gz",
+)
+
+
+def _data_cache_dir() -> Path:
+    base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")
+    return Path(base) / "raven_python" / "data"
+
+
+def _bundle(dataset: str, registry: dict) -> dict:
+    bundle = registry.get(dataset)
+    if bundle is None:
+        raise FileNotFoundError(
+            f"No data artefacts registered for {dataset!r}. Either pass a local "
+            f"directory of artefacts, or build them per docs/maintaining_kegg_data.md."
+        )
+    return bundle
+
+
+def ensure_data_file(
+    dataset: str,
+    filename: str,
+    *,
+    version: str | None = None,
+    registry: dict | None = None,
+) -> Path:
+    """Download (if needed) and return the cached path to one artefact file.
+
+    Looks the file up in the registry for ``dataset`` (at ``version`` or the
+    registry's default), downloads it to the version-pinned cache directory,
+    verifies its SHA256, and returns the path. Re-uses an already-cached copy.
+    """
+    registry = _DATA_REGISTRY if registry is None else registry
+    bundle = _bundle(dataset, registry)
+    ver = version or bundle["version"]
+    entry = bundle.get("files", {}).get(filename)
+    if entry is None:
+        raise FileNotFoundError(
+            f"{filename!r} is not registered for {dataset!r} {ver}. "
+            f"Available: {sorted(bundle.get('files', {}))}."
+        )
+
+    dest_dir = _data_cache_dir() / f"{dataset}-{ver}"
+    dest = dest_dir / filename
+    if dest.exists():
+        return dest
+
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    tmp = dest.with_name(dest.name + ".part")
+    with urlopen(entry["url"]) as resp, open(tmp, "wb") as out:  # noqa: S310 (trusted registry URLs)
+        shutil.copyfileobj(resp, out)
+    digest = _sha256(tmp)
+    if digest != entry["sha256"]:
+        tmp.unlink(missing_ok=True)
+        raise ValueError(
+            f"SHA256 mismatch for {dataset}/{filename} ({ver}): "
+            f"expected {entry['sha256']}, got {digest}."
+        )
+    tmp.replace(dest)
+    return dest
+
+
+def ensure_kegg_data(
+    *,
+    version: str | None = None,
+    files: tuple[str, ...] = CORE_KEGG_FILES,
+    registry: dict | None = None,
+) -> Path:
+    """Ensure the core KEGG artefacts are cached; return their directory.
+
+    Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg``
+    dataset and returns the cache directory holding them — ready to pass as the
+    ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`.
+    """
+    registry = _DATA_REGISTRY if registry is None else registry
+    ver = version or _bundle("kegg", registry)["version"]
+    for filename in files:
+        ensure_data_file("kegg", filename, version=ver, registry=registry)
+    return _data_cache_dir() / f"kegg-{ver}"
+
+
+def ensure_kegg_hmm_library(
+    domain: str, *, version: str | None = None, registry: dict | None = None
+) -> Path:
+    """Ensure a domain HMM library (and its hmmpress index) is cached; return its path.
+
+    ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``<domain>.hmm``
+    plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the
+    path to the ``.hmm`` (the argument for :func:`run_hmmscan`).
+    """
+    registry = _DATA_REGISTRY if registry is None else registry
+    ver = version or _bundle("kegg", registry)["version"]
+    base = f"{domain}.hmm"
+    library = ensure_data_file("kegg", base, version=ver, registry=registry)
+    for suffix in (".h3f", ".h3i", ".h3m", ".h3p"):
+        ensure_data_file("kegg", base + suffix, version=ver, registry=registry)
+    return library
diff --git a/tests/test_binaries.py b/tests/test_binaries.py
new file mode 100644
index 0000000..d74ce0b
--- /dev/null
+++ b/tests/test_binaries.py
@@ -0,0 +1,80 @@
+"""Tests for raven_python.binaries (binary resolution + bundled-ZIP provisioning)."""
+import hashlib
+import shutil
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from raven_python import binaries
+
+
+def test_resolve_explicit_path():
+    assert binaries.resolve_binary("blastp", binary="/opt/x/blastp") == "/opt/x/blastp"
+
+
+def test_resolve_env_var(monkeypatch):
+    monkeypatch.setenv("RAVEN_PYTHON_DIAMOND", "/custom/diamond")
+    assert binaries.resolve_binary("diamond") == "/custom/diamond"
+
+
+@pytest.mark.skipif(not shutil.which("blastp"), reason="blastp not installed")
+def test_resolve_via_path():
+    assert binaries.resolve_binary("blastp") == shutil.which("blastp")
+
+
+def test_resolve_unresolvable_raises(monkeypatch):
+    monkeypatch.setattr(shutil, "which", lambda _: None)
+    with pytest.raises(FileNotFoundError, match="Could not find"):
+        binaries.resolve_binary("diamond")  # empty registry, not on PATH
+
+
+def test_platform_key_format():
+    key = binaries.platform_key()
+    assert "-" in key
+    os_part, arch = key.split("-", 1)
+    assert os_part in {"linux", "macos", "windows"} or os_part  # tolerant
+
+
+def test_ensure_binary_downloads_verifies_extracts(tmp_path, monkeypatch):
+    # Build a fake bundle ZIP containing an executable, served via file:// URL.
+    exe = tmp_path / "footool"
+    exe.write_text("#!/bin/sh\necho hi\n")
+    archive = tmp_path / "footool.zip"
+    with zipfile.ZipFile(archive, "w") as zf:
+        zf.write(exe, "footool")
+    sha = hashlib.sha256(archive.read_bytes()).hexdigest()
+
+    registry = {
+        "footool": {
+            "version": "1.0",
+            "provides": ["footool"],
+            "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": sha}},
+        }
+    }
+    monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache"))
+
+    path = binaries.ensure_binary("footool", registry=registry)
+    assert Path(path).exists()
+    assert Path(path).name == "footool"
+    # cached on second call (same path, no re-download needed)
+    assert binaries.ensure_binary("footool", registry=registry) == path
+
+
+def test_ensure_binary_sha_mismatch(tmp_path, monkeypatch):
+    archive = tmp_path / "x.zip"
+    with zipfile.ZipFile(archive, "w") as zf:
+        zf.writestr("footool", "data")
+    registry = {
+        "footool": {"version": "1", "provides": ["footool"],
+                    "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": "deadbeef"}}}
+    }
+    monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache"))
+    with pytest.raises(ValueError, match="SHA256 mismatch"):
+        binaries.ensure_binary("footool", registry=registry)
+
+
+def test_ensure_binary_unhosted_platform_raises(tmp_path):
+    registry = {"footool": {"version": "1", "provides": ["footool"], "platforms": {}}}
+    with pytest.raises(FileNotFoundError, match="No bundled"):
+        binaries.ensure_binary("footool", registry=registry)
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..714c3a9
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,89 @@
+"""Tests for ensure_data (data.py). Uses file:// URLs to avoid the network."""
+import hashlib
+
+import pytest
+
+from raven_python.data import (
+    CORE_KEGG_FILES,
+    ensure_data_file,
+    ensure_kegg_data,
+)
+
+
+def _sha256(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
+@pytest.fixture
+def served(tmp_path, monkeypatch):
+    """A fake registry served from local files, with the cache pointed at tmp."""
+    src = tmp_path / "src"
+    src.mkdir()
+    payloads = {
+        "reference_model.yml.gz": b"!!omap model bytes",
+        "ko_reaction.tsv.gz": b"ko\treaction\n",
+        "ko_names.tsv.gz": b"ko\tname\n",
+        "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n",
+        "rxn_flags.tsv.gz": b"reaction\tspontaneous\n",
+    }
+    files = {}
+    for name, data in payloads.items():
+        path = src / name
+        path.write_bytes(data)
+        files[name] = {"url": path.as_uri(), "sha256": _sha256(data)}
+    registry = {"kegg": {"version": "v1", "files": files}}
+
+    cache = tmp_path / "cache"
+    monkeypatch.setenv("XDG_CACHE_HOME", str(cache))
+    return registry, cache, payloads
+
+
+def test_ensure_data_file_downloads_and_caches(served):
+    registry, cache, payloads = served
+    path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry)
+    assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz"
+    assert path.read_bytes() == payloads["ko_reaction.tsv.gz"]
+
+
+def test_ensure_data_file_reuses_cache(served, monkeypatch):
+    registry, _, _ = served
+    first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+    # Break the URL: a second call must hit the cache, not re-download.
+    registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent"
+    second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+    assert first == second and second.exists()
+
+
+def test_sha256_mismatch_rejected(served):
+    registry, cache, _ = served
+    registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64
+    with pytest.raises(ValueError, match="SHA256 mismatch"):
+        ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry)
+    # The corrupt partial download must not be left behind.
+    assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists()
+
+
+def test_unknown_dataset_actionable_error(served):
+    registry, _, _ = served
+    with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
+        ensure_data_file("metacyc", "x", registry=registry)
+
+
+def test_unknown_file_lists_available(served):
+    registry, _, _ = served
+    with pytest.raises(FileNotFoundError, match="not registered"):
+        ensure_data_file("kegg", "missing.tsv.gz", registry=registry)
+
+
+def test_ensure_kegg_data_fetches_core_set(served):
+    registry, cache, _ = served
+    out = ensure_kegg_data(registry=registry)
+    assert out == cache / "raven_python" / "data" / "kegg-v1"
+    for name in CORE_KEGG_FILES:
+        assert (out / name).is_file()
+
+
+def test_empty_registry_raises():
+    # The shipped registry is empty until artefacts are published.
+    with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
+        ensure_data_file("kegg", "ko_reaction.tsv.gz")
diff --git a/tests/test_scripts_registry.py b/tests/test_scripts_registry.py
new file mode 100644
index 0000000..c9c03cf
--- /dev/null
+++ b/tests/test_scripts_registry.py
@@ -0,0 +1,58 @@
+"""Tests for scripts/make_registry_snippet.py registry-entry helpers."""
+import hashlib
+import importlib.util
+import json
+from pathlib import Path
+
+import pytest
+
+# scripts/ is not a package; load the module directly by path.
+_SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "make_registry_snippet.py"
+_spec = importlib.util.spec_from_file_location("make_registry_snippet", _SCRIPT)
+mrs = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(mrs)
+
+
+def _sha(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
+def test_data_entry_lists_files_with_urls_and_checksums(tmp_path):
+    (tmp_path / "reference_model.yml.gz").write_bytes(b"model")
+    (tmp_path / "ko_reaction.tsv.gz").write_bytes(b"table")
+    (tmp_path / ".hidden").write_bytes(b"skip")  # hidden files ignored
+
+    entry = mrs.data_entry("kegg", "kegg116", "https://x/rel/", tmp_path)
+    assert entry["version"] == "kegg116"
+    assert set(entry["files"]) == {"reference_model.yml.gz", "ko_reaction.tsv.gz"}
+    ref = entry["files"]["reference_model.yml.gz"]
+    assert ref["url"] == "https://x/rel/reference_model.yml.gz"  # trailing slash collapsed
+    assert ref["sha256"] == _sha(b"model")
+
+
+def test_data_entry_empty_dir_errors(tmp_path):
+    with pytest.raises(SystemExit):
+        mrs.data_entry("kegg", "v1", "https://x", tmp_path)
+
+
+def test_binary_entry_parses_platform_from_filename(tmp_path):
+    (tmp_path / "blast-2.16.0-linux-x86_64.zip").write_bytes(b"linux")
+    (tmp_path / "blast-2.16.0-macos-arm64.zip").write_bytes(b"mac")
+    (tmp_path / "other-1.0-linux-x86_64.zip").write_bytes(b"nope")  # different bundle
+
+    entry = mrs.binary_entry("blast", "2.16.0", ["blastp", "makeblastdb"], "https://x", tmp_path)
+    assert entry["provides"] == ["blastp", "makeblastdb"]
+    assert set(entry["platforms"]) == {"linux-x86_64", "macos-arm64"}
+    assert entry["platforms"]["macos-arm64"]["sha256"] == _sha(b"mac")
+    assert entry["platforms"]["linux-x86_64"]["url"].endswith("blast-2.16.0-linux-x86_64.zip")
+
+
+def test_binary_entry_no_zips_errors(tmp_path):
+    with pytest.raises(SystemExit):
+        mrs.binary_entry("blast", "2.16.0", ["blastp"], "https://x", tmp_path)
+
+
+def test_render_is_valid_json_round_trip():
+    entry = {"version": "v1", "files": {"a": {"url": "u", "sha256": "s"}}}
+    text = mrs.render("kegg", entry)
+    assert json.loads(text) == {"kegg": entry}