From 0a9f4a1f20f1503b551273ffdbba2a25a00e9f72 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:45:40 +0200
Subject: [PATCH 1/7] Project scaffold: pyproject + package skeleton + README +
 LICENSE

---
 LICENSE                      | 541 +++++++++++++++++++++++++++++++++++
 README.md                    |  93 +++++-
 pyproject.toml               |  84 ++++++
 src/raven_python/__init__.py |  10 +
 4 files changed, 715 insertions(+), 13 deletions(-)
 create mode 100644 LICENSE
 create mode 100644 pyproject.toml
 create mode 100644 src/raven_python/__init__.py

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1f44cf3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,541 @@
+GNU GENERAL PUBLIC LICENSE
+==========================
+
+Version 3, 29 June 2007
+
+Copyright &copy; 2007 Free Software Foundation, Inc. &lt;<http://fsf.org/>&gt;
+
+Everyone is permitted to copy and distribute verbatim copies of this license
+document, but changing it is not allowed.
+
+## Preamble
+
+The GNU General Public License is a free, copyleft license for software and other
+kinds of works.
+
+The licenses for most software and other practical works are designed to take away
+your freedom to share and change the works. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change all versions of a
+program--to make sure it remains free software for all its users. We, the Free
+Software Foundation, use the GNU General Public License for most of our software; it
+applies also to any other work released this way by its authors. You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not price. Our General
+Public Licenses are designed to make sure that you have the freedom to distribute
+copies of free software (and charge for them if you wish), that you receive source
+code or can get it if you want it, that you can change the software or use pieces of
+it in new free programs, and that you know you can do these things.
+
+To protect your rights, we need to prevent others from denying you these rights or
+asking you to surrender the rights. Therefore, you have certain responsibilities if
+you distribute copies of the software, or if you modify it: responsibilities to
+respect the freedom of others.
+
+For example, if you distribute copies of such a program, whether gratis or for a fee,
+you must pass on to the recipients the same freedoms that you received. You must make
+sure that they, too, receive or can get the source code. And you must show them these
+terms so they know their rights.
+
+Developers that use the GNU GPL protect your rights with two steps: (1) assert
+copyright on the software, and (2) offer you this License giving you legal permission
+to copy, distribute and/or modify it.
+
+For the developers' and authors' protection, the GPL clearly explains that there is
+no warranty for this free software. For both users' and authors' sake, the GPL
+requires that modified versions be marked as changed, so that their problems will not
+be attributed erroneously to authors of previous versions.
+
+Some devices are designed to deny users access to install or run modified versions of
+the software inside them, although the manufacturer can do so. This is fundamentally
+incompatible with the aim of protecting users' freedom to change the software. The
+systematic pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we have designed
+this version of the GPL to prohibit the practice for those products. If such problems
+arise substantially in other domains, we stand ready to extend this provision to
+those domains in future versions of the GPL, as needed to protect the freedom of
+users.
+
+Finally, every program is threatened constantly by software patents. States should
+not allow patents to restrict development and use of software on general-purpose
+computers, but in those that do, we wish to avoid the special danger that patents
+applied to a free program could make it effectively proprietary. To prevent this, the
+GPL assures that patents cannot be used to render the program non-free.
+
+The precise terms and conditions for copying, distribution and modification follow.
+
+## TERMS AND CONDITIONS
+
+### 0. Definitions.
+
+&ldquo;This License&rdquo; refers to version 3 of the GNU General Public License.
+
+&ldquo;Copyright&rdquo; also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+&ldquo;The Program&rdquo; refers to any copyrightable work licensed under this
+License. Each licensee is addressed as &ldquo;you&rdquo;. &ldquo;Licensees&rdquo; and
+&ldquo;recipients&rdquo; may be individuals or organizations.
+
+To &ldquo;modify&rdquo; a work means to copy from or adapt all or part of the work in
+a fashion requiring copyright permission, other than the making of an exact copy. The
+resulting work is called a &ldquo;modified version&rdquo; of the earlier work or a
+work &ldquo;based on&rdquo; the earlier work.
+
+A &ldquo;covered work&rdquo; means either the unmodified Program or a work based on
+the Program.
+
+To &ldquo;propagate&rdquo; a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for infringement under
+applicable copyright law, except executing it on a computer or modifying a private
+copy. Propagation includes copying, distribution (with or without modification),
+making available to the public, and in some countries other activities as well.
+
+To &ldquo;convey&rdquo; a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through a computer
+network, with no transfer of a copy, is not conveying.
+
+An interactive user interface displays &ldquo;Appropriate Legal Notices&rdquo; to the
+extent that it includes a convenient and prominently visible feature that (1)
+displays an appropriate copyright notice, and (2) tells the user that there is no
+warranty for the work (except to the extent that warranties are provided), that
+licensees may convey the work under this License, and how to view a copy of this
+License. If the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+### 1. Source Code.
+
+The &ldquo;source code&rdquo; for a work means the preferred form of the work for
+making modifications to it. &ldquo;Object code&rdquo; means any non-source form of a
+work.
+
+A &ldquo;Standard Interface&rdquo; means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of interfaces
+specified for a particular programming language, one that is widely used among
+developers working in that language.
+
+The &ldquo;System Libraries&rdquo; of an executable work include anything, other than
+the work as a whole, that (a) is included in the normal form of packaging a Major
+Component, but which is not part of that Major Component, and (b) serves only to
+enable use of the work with that Major Component, or to implement a Standard
+Interface for which an implementation is available to the public in source code form.
+A &ldquo;Major Component&rdquo;, in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system (if any) on which
+the executable work runs, or a compiler used to produce the work, or an object code
+interpreter used to run it.
+
+The &ldquo;Corresponding Source&rdquo; for a work in object code form means all the
+source code needed to generate, install, and (for an executable work) run the object
+code and to modify the work, including scripts to control those activities. However,
+it does not include the work's System Libraries, or general-purpose tools or
+generally available free programs which are used unmodified in performing those
+activities but which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for the work, and
+the source code for shared libraries and dynamically linked subprograms that the work
+is specifically designed to require, such as by intimate data communication or
+control flow between those subprograms and other parts of the work.
+
+The Corresponding Source need not include anything that users can regenerate
+automatically from other parts of the Corresponding Source.
+
+The Corresponding Source for a work in source code form is that same work.
+
+### 2. Basic Permissions.
+
+All rights granted under this License are granted for the term of copyright on the
+Program, and are irrevocable provided the stated conditions are met. This License
+explicitly affirms your unlimited permission to run the unmodified Program. The
+output from running a covered work is covered by this License only if the output,
+given its content, constitutes a covered work. This License acknowledges your rights
+of fair use or other equivalent, as provided by copyright law.
+
+You may make, run and propagate covered works that you do not convey, without
+conditions so long as your license otherwise remains in force. You may convey covered
+works to others for the sole purpose of having them make modifications exclusively
+for you, or provide you with facilities for running those works, provided that you
+comply with the terms of this License in conveying all material for which you do not
+control copyright. Those thus making or running the covered works for you must do so
+exclusively on your behalf, under your direction and control, on terms that prohibit
+them from making any copies of your copyrighted material outside their relationship
+with you.
+
+Conveying under any other circumstances is permitted solely under the conditions
+stated below. Sublicensing is not allowed; section 10 makes it unnecessary.
+
+### 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+No covered work shall be deemed part of an effective technological measure under any
+applicable law fulfilling obligations under article 11 of the WIPO copyright treaty
+adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention
+of such measures.
+
+When you convey a covered work, you waive any legal power to forbid circumvention of
+technological measures to the extent such circumvention is effected by exercising
+rights under this License with respect to the covered work, and you disclaim any
+intention to limit operation or modification of the work as a means of enforcing,
+against the work's users, your or third parties' legal rights to forbid circumvention
+of technological measures.
+
+### 4. Conveying Verbatim Copies.
+
+You may convey verbatim copies of the Program's source code as you receive it, in any
+medium, provided that you conspicuously and appropriately publish on each copy an
+appropriate copyright notice; keep intact all notices stating that this License and
+any non-permissive terms added in accord with section 7 apply to the code; keep
+intact all notices of the absence of any warranty; and give all recipients a copy of
+this License along with the Program.
+
+You may charge any price or no price for each copy that you convey, and you may offer
+support or warranty protection for a fee.
+
+### 5. Conveying Modified Source Versions.
+
+You may convey a work based on the Program, or the modifications to produce it from
+the Program, in the form of source code under the terms of section 4, provided that
+you also meet all of these conditions:
+
+* **a)** The work must carry prominent notices stating that you modified it, and giving a
+relevant date.
+* **b)** The work must carry prominent notices stating that it is released under this
+License and any conditions added under section 7. This requirement modifies the
+requirement in section 4 to &ldquo;keep intact all notices&rdquo;.
+* **c)** You must license the entire work, as a whole, under this License to anyone who
+comes into possession of a copy. This License will therefore apply, along with any
+applicable section 7 additional terms, to the whole of the work, and all its parts,
+regardless of how they are packaged. This License gives no permission to license the
+work in any other way, but it does not invalidate such permission if you have
+separately received it.
+* **d)** If the work has interactive user interfaces, each must display Appropriate Legal
+Notices; however, if the Program has interactive interfaces that do not display
+Appropriate Legal Notices, your work need not make them do so.
+
+A compilation of a covered work with other separate and independent works, which are
+not by their nature extensions of the covered work, and which are not combined with
+it such as to form a larger program, in or on a volume of a storage or distribution
+medium, is called an &ldquo;aggregate&rdquo; if the compilation and its resulting
+copyright are not used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work in an aggregate
+does not cause this License to apply to the other parts of the aggregate.
+
+### 6. Conveying Non-Source Forms.
+
+You may convey a covered work in object code form under the terms of sections 4 and
+5, provided that you also convey the machine-readable Corresponding Source under the
+terms of this License, in one of these ways:
+
+* **a)** Convey the object code in, or embodied in, a physical product (including a
+physical distribution medium), accompanied by the Corresponding Source fixed on a
+durable physical medium customarily used for software interchange.
+* **b)** Convey the object code in, or embodied in, a physical product (including a
+physical distribution medium), accompanied by a written offer, valid for at least
+three years and valid for as long as you offer spare parts or customer support for
+that product model, to give anyone who possesses the object code either (1) a copy of
+the Corresponding Source for all the software in the product that is covered by this
+License, on a durable physical medium customarily used for software interchange, for
+a price no more than your reasonable cost of physically performing this conveying of
+source, or (2) access to copy the Corresponding Source from a network server at no
+charge.
+* **c)** Convey individual copies of the object code with a copy of the written offer to
+provide the Corresponding Source. This alternative is allowed only occasionally and
+noncommercially, and only if you received the object code with such an offer, in
+accord with subsection 6b.
+* **d)** Convey the object code by offering access from a designated place (gratis or for
+a charge), and offer equivalent access to the Corresponding Source in the same way
+through the same place at no further charge. You need not require recipients to copy
+the Corresponding Source along with the object code. If the place to copy the object
+code is a network server, the Corresponding Source may be on a different server
+(operated by you or a third party) that supports equivalent copying facilities,
+provided you maintain clear directions next to the object code saying where to find
+the Corresponding Source. Regardless of what server hosts the Corresponding Source,
+you remain obligated to ensure that it is available for as long as needed to satisfy
+these requirements.
+* **e)** Convey the object code using peer-to-peer transmission, provided you inform
+other peers where the object code and Corresponding Source of the work are being
+offered to the general public at no charge under subsection 6d.
+
+A separable portion of the object code, whose source code is excluded from the
+Corresponding Source as a System Library, need not be included in conveying the
+object code work.
+
+A &ldquo;User Product&rdquo; is either (1) a &ldquo;consumer product&rdquo;, which
+means any tangible personal property which is normally used for personal, family, or
+household purposes, or (2) anything designed or sold for incorporation into a
+dwelling. In determining whether a product is a consumer product, doubtful cases
+shall be resolved in favor of coverage. For a particular product received by a
+particular user, &ldquo;normally used&rdquo; refers to a typical or common use of
+that class of product, regardless of the status of the particular user or of the way
+in which the particular user actually uses, or expects or is expected to use, the
+product. A product is a consumer product regardless of whether the product has
+substantial commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+&ldquo;Installation Information&rdquo; for a User Product means any methods,
+procedures, authorization keys, or other information required to install and execute
+modified versions of a covered work in that User Product from a modified version of
+its Corresponding Source. The information must suffice to ensure that the continued
+functioning of the modified object code is in no case prevented or interfered with
+solely because modification has been made.
+
+If you convey an object code work under this section in, or with, or specifically for
+use in, a User Product, and the conveying occurs as part of a transaction in which
+the right of possession and use of the User Product is transferred to the recipient
+in perpetuity or for a fixed term (regardless of how the transaction is
+characterized), the Corresponding Source conveyed under this section must be
+accompanied by the Installation Information. But this requirement does not apply if
+neither you nor any third party retains the ability to install modified object code
+on the User Product (for example, the work has been installed in ROM).
+
+The requirement to provide Installation Information does not include a requirement to
+continue to provide support service, warranty, or updates for a work that has been
+modified or installed by the recipient, or for the User Product in which it has been
+modified or installed. Access to a network may be denied when the modification itself
+materially and adversely affects the operation of the network or violates the rules
+and protocols for communication across the network.
+
+Corresponding Source conveyed, and Installation Information provided, in accord with
+this section must be in a format that is publicly documented (and with an
+implementation available to the public in source code form), and must require no
+special password or key for unpacking, reading or copying.
+
+### 7. Additional Terms.
+
+&ldquo;Additional permissions&rdquo; are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions. Additional
+permissions that are applicable to the entire Program shall be treated as though they
+were included in this License, to the extent that they are valid under applicable
+law. If additional permissions apply only to part of the Program, that part may be
+used separately under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+When you convey a copy of a covered work, you may at your option remove any
+additional permissions from that copy, or from any part of it. (Additional
+permissions may be written to require their own removal in certain cases when you
+modify the work.) You may place additional permissions on material, added by you to a
+covered work, for which you have or can give appropriate copyright permission.
+
+Notwithstanding any other provision of this License, for material you add to a
+covered work, you may (if authorized by the copyright holders of that material)
+supplement the terms of this License with terms:
+
+* **a)** Disclaiming warranty or limiting liability differently from the terms of
+sections 15 and 16 of this License; or
+* **b)** Requiring preservation of specified reasonable legal notices or author
+attributions in that material or in the Appropriate Legal Notices displayed by works
+containing it; or
+* **c)** Prohibiting misrepresentation of the origin of that material, or requiring that
+modified versions of such material be marked in reasonable ways as different from the
+original version; or
+* **d)** Limiting the use for publicity purposes of names of licensors or authors of the
+material; or
+* **e)** Declining to grant rights under trademark law for use of some trade names,
+trademarks, or service marks; or
+* **f)** Requiring indemnification of licensors and authors of that material by anyone
+who conveys the material (or modified versions of it) with contractual assumptions of
+liability to the recipient, for any liability that these contractual assumptions
+directly impose on those licensors and authors.
+
+All other non-permissive additional terms are considered &ldquo;further
+restrictions&rdquo; within the meaning of section 10. If the Program as you received
+it, or any part of it, contains a notice stating that it is governed by this License
+along with a term that is a further restriction, you may remove that term. If a
+license document contains a further restriction but permits relicensing or conveying
+under this License, you may add to a covered work material governed by the terms of
+that license document, provided that the further restriction does not survive such
+relicensing or conveying.
+
+If you add terms to a covered work in accord with this section, you must place, in
+the relevant source files, a statement of the additional terms that apply to those
+files, or a notice indicating where to find the applicable terms.
+
+Additional terms, permissive or non-permissive, may be stated in the form of a
+separately written license, or stated as exceptions; the above requirements apply
+either way.
+
+### 8. Termination.
+
+You may not propagate or modify a covered work except as expressly provided under
+this License. Any attempt otherwise to propagate or modify it is void, and will
+automatically terminate your rights under this License (including any patent licenses
+granted under the third paragraph of section 11).
+
+However, if you cease all violation of this License, then your license from a
+particular copyright holder is reinstated (a) provisionally, unless and until the
+copyright holder explicitly and finally terminates your license, and (b) permanently,
+if the copyright holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+Moreover, your license from a particular copyright holder is reinstated permanently
+if the copyright holder notifies you of the violation by some reasonable means, this
+is the first time you have received notice of violation of this License (for any
+work) from that copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+Termination of your rights under this section does not terminate the licenses of
+parties who have received copies or rights from you under this License. If your
+rights have been terminated and not permanently reinstated, you do not qualify to
+receive new licenses for the same material under section 10.
+
+### 9. Acceptance Not Required for Having Copies.
+
+You are not required to accept this License in order to receive or run a copy of the
+Program. Ancillary propagation of a covered work occurring solely as a consequence of
+using peer-to-peer transmission to receive a copy likewise does not require
+acceptance. However, nothing other than this License grants you permission to
+propagate or modify any covered work. These actions infringe copyright if you do not
+accept this License. Therefore, by modifying or propagating a covered work, you
+indicate your acceptance of this License to do so.
+
+### 10. Automatic Licensing of Downstream Recipients.
+
+Each time you convey a covered work, the recipient automatically receives a license
+from the original licensors, to run, modify and propagate that work, subject to this
+License. You are not responsible for enforcing compliance by third parties with this
+License.
+
+An &ldquo;entity transaction&rdquo; is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an organization, or
+merging organizations. If propagation of a covered work results from an entity
+transaction, each party to that transaction who receives a copy of the work also
+receives whatever licenses to the work the party's predecessor in interest had or
+could give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if the predecessor
+has it or can get it with reasonable efforts.
+
+You may not impose any further restrictions on the exercise of the rights granted or
+affirmed under this License. For example, you may not impose a license fee, royalty,
+or other charge for exercise of rights granted under this License, and you may not
+initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging
+that any patent claim is infringed by making, using, selling, offering for sale, or
+importing the Program or any portion of it.
+
+### 11. Patents.
+
+A &ldquo;contributor&rdquo; is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The work thus
+licensed is called the contributor's &ldquo;contributor version&rdquo;.
+
+A contributor's &ldquo;essential patent claims&rdquo; are all patent claims owned or
+controlled by the contributor, whether already acquired or hereafter acquired, that
+would be infringed by some manner, permitted by this License, of making, using, or
+selling its contributor version, but do not include claims that would be infringed
+only as a consequence of further modification of the contributor version. For
+purposes of this definition, &ldquo;control&rdquo; includes the right to grant patent
+sublicenses in a manner consistent with the requirements of this License.
+
+Each contributor grants you a non-exclusive, worldwide, royalty-free patent license
+under the contributor's essential patent claims, to make, use, sell, offer for sale,
+import and otherwise run, modify and propagate the contents of its contributor
+version.
+
+In the following three paragraphs, a &ldquo;patent license&rdquo; is any express
+agreement or commitment, however denominated, not to enforce a patent (such as an
+express permission to practice a patent or covenant not to sue for patent
+infringement). To &ldquo;grant&rdquo; such a patent license to a party means to make
+such an agreement or commitment not to enforce a patent against the party.
+
+If you convey a covered work, knowingly relying on a patent license, and the
+Corresponding Source of the work is not available for anyone to copy, free of charge
+and under the terms of this License, through a publicly available network server or
+other readily accessible means, then you must either (1) cause the Corresponding
+Source to be so available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner consistent with
+the requirements of this License, to extend the patent license to downstream
+recipients. &ldquo;Knowingly relying&rdquo; means you have actual knowledge that, but
+for the patent license, your conveying the covered work in a country, or your
+recipient's use of the covered work in a country, would infringe one or more
+identifiable patents in that country that you have reason to believe are valid.
+
+If, pursuant to or in connection with a single transaction or arrangement, you
+convey, or propagate by procuring conveyance of, a covered work, and grant a patent
+license to some of the parties receiving the covered work authorizing them to use,
+propagate, modify or convey a specific copy of the covered work, then the patent
+license you grant is automatically extended to all recipients of the covered work and
+works based on it.
+
+A patent license is &ldquo;discriminatory&rdquo; if it does not include within the
+scope of its coverage, prohibits the exercise of, or is conditioned on the
+non-exercise of one or more of the rights that are specifically granted under this
+License. You may not convey a covered work if you are a party to an arrangement with
+a third party that is in the business of distributing software, under which you make
+payment to the third party based on the extent of your activity of conveying the
+work, and under which the third party grants, to any of the parties who would receive
+the covered work from you, a discriminatory patent license (a) in connection with
+copies of the covered work conveyed by you (or copies made from those copies), or (b)
+primarily for and in connection with specific products or compilations that contain
+the covered work, unless you entered into that arrangement, or that patent license
+was granted, prior to 28 March 2007.
+
+Nothing in this License shall be construed as excluding or limiting any implied
+license or other defenses to infringement that may otherwise be available to you
+under applicable patent law.
+
+### 12. No Surrender of Others' Freedom.
+
+If conditions are imposed on you (whether by court order, agreement or otherwise)
+that contradict the conditions of this License, they do not excuse you from the
+conditions of this License. If you cannot convey a covered work so as to satisfy
+simultaneously your obligations under this License and any other pertinent
+obligations, then as a consequence you may not convey it at all. For example, if you
+agree to terms that obligate you to collect a royalty for further conveying from
+those to whom you convey the Program, the only way you could satisfy both those terms
+and this License would be to refrain entirely from conveying the Program.
+
+### 13. Use with the GNU Affero General Public License.
+
+Notwithstanding any other provision of this License, you have permission to link or
+combine any covered work with a work licensed under version 3 of the GNU Affero
+General Public License into a single combined work, and to convey the resulting work.
+The terms of this License will continue to apply to the part which is the covered
+work, but the special requirements of the GNU Affero General Public License, section
+13, concerning interaction through a network will apply to the combination as such.
+
+### 14. Revised Versions of this License.
+
+The Free Software Foundation may publish revised and/or new versions of the GNU
+General Public License from time to time. Such new versions will be similar in spirit
+to the present version, but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program specifies that
+a certain numbered version of the GNU General Public License &ldquo;or any later
+version&rdquo; applies to it, you have the option of following the terms and
+conditions either of that numbered version or of any later version published by the
+Free Software Foundation. If the Program does not specify a version number of the GNU
+General Public License, you may choose any version ever published by the Free
+Software Foundation.
+
+If the Program specifies that a proxy can decide which future versions of the GNU
+General Public License can be used, that proxy's public statement of acceptance of a
+version permanently authorizes you to choose that version for the Program.
+
+Later license versions may give you additional or different permissions. However, no
+additional obligations are imposed on any author or copyright holder as a result of
+your choosing to follow a later version.
+
+### 15. Disclaimer of Warranty.
+
+THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM &ldquo;AS IS&rdquo; WITHOUT WARRANTY OF ANY KIND, EITHER
+EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE
+QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
+DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+### 16. Limitation of Liability.
+
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY
+COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS
+PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL,
+INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE
+OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE
+WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+### 17. Interpretation of Sections 15 and 16.
+
+If the disclaimer of warranty and limitation of liability provided above cannot be
+given local legal effect according to their terms, reviewing courts shall apply local
+law that most closely approximates an absolute waiver of all civil liability in
+connection with the Program, unless a warranty or assumption of liability accompanies
+a copy of the Program in return for a fee.
\ No newline at end of file
diff --git a/README.md b/README.md
index c8f3d64..69111be 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,82 @@
 # raven-python
 
-The Python counterpart of the
-[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB), built on
-[cobrapy](https://github.com/opencobra/cobrapy).
-
-`raven-python` covers de-novo reconstruction (KEGG + protein homology),
-context-specific model extraction (`tINIT` / `ftINIT`), metabolic-task
-validation, gap-filling, omics ingestion, sub-cellular localisation, model
-manipulation, and YAML / SIF / Excel I/O — preserving the established RAVEN
-workflows in a Python-native form.
-
-This `main` branch is intentionally empty. Development happens on the
-`develop` branch via a series of feature branches; see the open and merged
-pull requests for the current state of the port.
+[![CI](https://github.com/SysBioChalmers/raven-python/actions/workflows/ci.yml/badge.svg)](https://github.com/SysBioChalmers/raven-python/actions/workflows/ci.yml)
+
+**Reconstruction, Analysis and Visualisation of Metabolic Networks — in Python.**
+
+`raven-python` is the Python counterpart of the
+[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB). It builds on
+[**cobrapy**](https://github.com/opencobra/cobrapy) for everything cobrapy already does
+well (simulation, standard analyses, SBML I/O, model manipulation) and adds the
+functionality that's unique to RAVEN:
+
+* **De novo reconstruction** from KEGG and protein homology (BLAST / DIAMOND).
+* **Context-specific models** from omics data via **tINIT / ftINIT**, with task-aware
+  gap-filling and the linear-merge MILP reduction.
+* **Metabolic-task** validation (`check_tasks`, `fitTasks`).
+* **Connectivity gap-filling** against template models.
+* **Omics integration** — Human Protein Atlas (proteomics + RNA-seq) ingestion.
+* **Sub-cellular localisation** prediction by MILP, with partial-update mode and
+  pluggable predictors (WoLF PSORT, DeepLoc, …).
+* **N-model comparison**; **reporter metabolites**; **FSEOF**; **flux sampling**.
+* **YAML I/O** following the cobra standard, plus geckopy's `ec-*` enzyme-constrained
+  fields. **SIF** export. **RAVEN-style Excel** export.
+
+The status of every RAVEN function (ported, cheatsheet-mapped to cobra, or explicitly
+not ported) is documented function-by-function in
+**[docs/raven_migration.md](docs/raven_migration.md)**.
+
+## Design principle
+
+The canonical in-memory object is always a [`cobra.Model`](https://cobrapy.readthedocs.io).
+There is no parallel RAVEN struct, no `ravenCobraWrapper`-style adapter. RAVEN-specific
+fields that cobra doesn't model natively (`rxnMiriams`, `metDeltaG`,
+`rxnConfidenceScores`, …) live in cobra's `annotation` / `notes` dictionaries. This
+avoids duplicating cobra's data model and keeps raven-python interoperable with the wider
+COBRA ecosystem.
+
+## Status
+
+raven-python has been validated against MATLAB RAVEN on **Human-GEM** (5 Hart2015 cell-line
+models, Jaccard 0.975–0.980 — see [docs/humangem_validation.md](docs/humangem_validation.md)).
+The functional scope of the original RAVEN toolbox is covered with two principled
+omissions:
+
+* **MetaCyc-based reconstruction** is not implemented and is flagged for removal from
+  MATLAB RAVEN as well — see [IMPROVEMENTS.md](IMPROVEMENTS.md) under `R-MetaCyc`.
+* **Dynamic FBA** is not implemented — several maintained Python packages already cover
+  it ([`dfba`](https://pypi.org/project/dfba/), [`reframed`](https://pypi.org/project/reframed/),
+  [`mewpy`](https://pypi.org/project/mewpy/)).
+
+What's still open is catalogued in **[docs/todo.md](docs/todo.md)** (visualisation / Phase
+6 is the main item).
+
+## Installation (development)
+
+```bash
+git clone https://github.com/SysBioChalmers/raven-python
+cd raven-python
+pip install -e ".[dev]"
+```
+
+raven-python requires Python ≥ 3.11. Genome-scale (f)tINIT MILPs currently require **Gurobi**
+([details on solver portability](docs/init_solver_benchmark.md)); toy and unit-test work
+runs on the open-source GLPK.
+
+## Documentation
+
+See **[docs/README.md](docs/README.md)** for the documentation index.
+
+## Relationship to MATLAB RAVEN
+
+`raven-python` is a derivative work and is released under the same **GPL-3.0-or-later**
+license. If you use it in scientific work, please cite the RAVEN 2 paper:
+
+> Wang H, Marcišauskas S, Sánchez BJ, Domenzain I, Hermansson D, Agren R, Nielsen J,
+> Kerkhoven EJ. (2018) RAVEN 2.0: A versatile toolbox for metabolic network
+> reconstruction and a case study on *Streptomyces coelicolor*. PLoS Comput Biol 14(10):
+> e1006541.
+
+## License
+
+[GPL-3.0-or-later](LICENSE)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..faeeb1f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,84 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "raven-python"
+version = "0.0.1"
+description = "Reconstruction, Analysis and Visualization of Metabolic Networks in Python, a port of the RAVEN Toolbox built on cobrapy"
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "GPL-3.0-or-later" }
+authors = [
+    { name = "Eduard Kerkhoven", email = "eduardk@chalmers.se" },
+]
+keywords = [
+    "genome-scale-model",
+    "metabolic-model",
+    "reconstruction",
+    "raven",
+    "cobra",
+    "systems-biology",
+    "constraint-based-modeling",
+    "kegg",
+    "metacyc",
+    "tinit",
+]
+classifiers = [
+    "Development Status :: 2 - Pre-Alpha",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dependencies = [
+    "cobra>=0.29",
+    "numpy>=1.21",
+    "pandas>=2",
+    "scipy>=1.10",
+    "ruamel.yaml>=0.17",
+    "requests>=2.28",
+    "tqdm>=4.65",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=7",
+    "pytest-cov",
+    "ruff>=0.4",
+]
+excel = [
+    "openpyxl>=3.1",
+]
+plotting = [
+    "matplotlib>=3.5",
+]
+
+[project.urls]
+Homepage = "https://github.com/SysBioChalmers/raven-python"
+Source = "https://github.com/SysBioChalmers/raven-python"
+Issues = "https://github.com/SysBioChalmers/raven-python/issues"
+"RAVEN MATLAB" = "https://github.com/SysBioChalmers/RAVEN"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/raven_python"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py311"
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I", "UP", "B"]
+ignore = [
+    "E501",  # line length handled by the formatter
+]
+
+[tool.ruff.lint.per-file-ignores]
+"tests/*" = ["E402"]
diff --git a/src/raven_python/__init__.py b/src/raven_python/__init__.py
new file mode 100644
index 0000000..591c4c1
--- /dev/null
+++ b/src/raven_python/__init__.py
@@ -0,0 +1,10 @@
+"""raven_python — Python counterpart of the RAVEN Toolbox, built on cobrapy.
+
+raven_python reuses cobrapy for simulation, standard analyses, SBML I/O, and model
+manipulation, and provides the RAVEN-specific functionality on top: de novo
+reconstruction (KEGG / homology), context-specific modeling (tINIT / ftINIT),
+metabolic task validation, connectivity gap-filling, omics integration (HPA),
+sub-cellular localisation, N-model comparison, and the RAVEN-style I/O formats.
+"""
+
+__version__ = "0.0.1"

From b7b69ac4f5cc00831df993fc718dd960ab82e0df Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:45:40 +0200
Subject: [PATCH 2/7] Add GitHub Actions CI and the maintainer-scripts README

---
 .github/workflows/ci.yml | 48 ++++++++++++++++++++++++++++++++++++++++
 scripts/README.md        | 35 +++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 scripts/README.md

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..3f6d19b
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,48 @@
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+  workflow_dispatch:
+
+# A push that obsoletes a previous run cancels it.
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  lint:
+    name: ruff
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+          cache-dependency-path: pyproject.toml
+      - run: pip install --upgrade pip
+      - run: pip install ruff
+      - run: ruff check .
+
+  test:
+    name: pytest (py${{ matrix.python }})
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python: ["3.11", "3.12", "3.13"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+          cache: pip
+          cache-dependency-path: pyproject.toml
+      - run: pip install --upgrade pip
+      # ``-e .[dev,plotting,excel]`` so every optional extra is exercised.
+      # Gurobi is not installable on free runners; the relevant tests
+      # skip themselves when ``optlang.gurobi_interface`` cannot import.
+      - run: pip install -e ".[dev,plotting,excel]"
+      - run: pytest -q --maxfail=5 --durations=20
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..67a3b36
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,35 @@
+# Maintainer scripts
+
+Release-time tooling. Not part of the installed package — run them from a checkout
+with raven-python installed (`pip install -e .`). End users never need these.
+
+## `build_kegg_artefacts.py`
+
+Build the publishable KEGG artefact set from an arranged KEGG dump (see
+`download_kegg_dump`): the gzipped-YAML reference model, the gzipped-TSV tables,
+and (with `--hmms`) the per-domain pressed HMM libraries. Output is laid out ready
+to upload as release assets. See [docs/maintaining_kegg_data.md](../docs/maintaining_kegg_data.md).
+
+```bash
+python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts          # tables + model
+python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8
+```
+
+## `make_registry_snippet.py`
+
+After uploading the files to a release, compute their SHA256 and print the entry
+to merge into the runtime registry — `raven_python.data._DATA_REGISTRY` (data) or
+`raven_python.binaries._REGISTRY` (binary ZIP bundles). The checksum helper is shared
+with the resolvers, so published checksums always match what `ensure_data` /
+`ensure_binary` verify.
+
+```bash
+# Data artefacts:
+python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \
+    --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116
+
+# Binary bundle (ZIPs named <bundle>-<version>-<os>-<arch>.zip):
+python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \
+    --provides blastp makeblastdb --dir zips \
+    --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0
+```

From 50bea4006f174a3549e670fd3ea622f8e31083cf Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:46:42 +0200
Subject: [PATCH 3/7] Add the foundation utilities: GPR, balance, parse, sort,
 validate

---
 src/raven_python/utils/__init__.py |  16 ++++
 src/raven_python/utils/balance.py  |  89 +++++++++++++++++++++
 src/raven_python/utils/gpr.py      | 119 +++++++++++++++++++++++++++++
 src/raven_python/utils/parse.py    |  33 ++++++++
 src/raven_python/utils/sort.py     |  21 +++++
 src/raven_python/utils/validate.py |  86 +++++++++++++++++++++
 tests/test_utils_balance.py        |  76 ++++++++++++++++++
 tests/test_utils_gpr.py            |  84 ++++++++++++++++++++
 tests/test_utils_sort.py           |  42 ++++++++++
 tests/test_utils_validate.py       |  80 +++++++++++++++++++
 10 files changed, 646 insertions(+)
 create mode 100644 src/raven_python/utils/__init__.py
 create mode 100644 src/raven_python/utils/balance.py
 create mode 100644 src/raven_python/utils/gpr.py
 create mode 100644 src/raven_python/utils/parse.py
 create mode 100644 src/raven_python/utils/sort.py
 create mode 100644 src/raven_python/utils/validate.py
 create mode 100644 tests/test_utils_balance.py
 create mode 100644 tests/test_utils_gpr.py
 create mode 100644 tests/test_utils_sort.py
 create mode 100644 tests/test_utils_validate.py

diff --git a/src/raven_python/utils/__init__.py b/src/raven_python/utils/__init__.py
new file mode 100644
index 0000000..7127bdd
--- /dev/null
+++ b/src/raven_python/utils/__init__.py
@@ -0,0 +1,16 @@
+"""Shared helpers — GPR linting, elemental balance, model curation checks, id sorting."""
+from raven_python.utils.balance import ElementalBalance, get_elemental_balance
+from raven_python.utils.gpr import GPRIssue, find_non_dnf_grrules, is_dnf
+from raven_python.utils.sort import sort_identifiers
+from raven_python.utils.validate import ModelIssue, check_model
+
+__all__ = [
+    "ElementalBalance",
+    "GPRIssue",
+    "ModelIssue",
+    "check_model",
+    "find_non_dnf_grrules",
+    "get_elemental_balance",
+    "is_dnf",
+    "sort_identifiers",
+]
diff --git a/src/raven_python/utils/balance.py b/src/raven_python/utils/balance.py
new file mode 100644
index 0000000..ee64ab4
--- /dev/null
+++ b/src/raven_python/utils/balance.py
@@ -0,0 +1,89 @@
+"""Check the elemental balance of reactions, distinguishing *unbalanced* from
+*unknown* (missing formula).
+
+cobra's ``reaction.check_mass_balance()`` silently treats a missing formula as
+empty, so a reaction can look "unbalanced" — or even balanced — when the truth is
+that the data is incomplete. This module checks for missing formulas first and
+returns a graded status
+per reaction (``balanced`` / ``unbalanced`` / ``unknown``) plus the element
+imbalance — over a batch, as structured data.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ElementalBalance:
+    """Balance result for one reaction.
+
+    Attributes
+    ----------
+    reaction_id
+        ID of the reaction.
+    status
+        ``"balanced"`` — elements balance;
+        ``"unbalanced"`` — they do not (see ``imbalance``);
+        ``"unknown"`` — at least one metabolite has no formula, so it cannot be
+        determined (cobra would silently miscount these).
+    imbalance
+        Element → net coefficient (products − reactants), only for
+        ``"unbalanced"``; empty otherwise. Charge is not included.
+    """
+
+    reaction_id: str
+    status: str
+    imbalance: dict[str, float] = field(default_factory=dict)
+
+
+def get_elemental_balance(
+    model: cobra.Model, reactions=None
+) -> list[ElementalBalance]:
+    """Check whether reactions are elementally balanced.
+    Parameters
+    ----------
+    reactions
+        Reaction IDs/objects to check; default all reactions. (Boundary
+        reactions exchange mass with the environment and will read as
+        ``unbalanced`` — filter them out if that is not wanted.)
+
+    Returns
+    -------
+    list of ElementalBalance
+        One entry per checked reaction, in model order.
+    """
+    if reactions is None:
+        rxns = list(model.reactions)
+    else:
+        if isinstance(reactions, (str, cobra.Reaction)):
+            reactions = [reactions]
+        rxns = [
+            r if isinstance(r, cobra.Reaction) else model.reactions.get_by_id(r)
+            for r in reactions
+        ]
+
+    results: list[ElementalBalance] = []
+    for rxn in rxns:
+        if not rxn.metabolites:
+            # A reaction with no metabolites used to fall through to ``balanced``
+            # (vacuously) because ``any()`` over the empty list is False and the
+            # zero-element imbalance dict is empty. Treat the no-formula case
+            # (zero formulae present) as ``unknown``: we can't determine balance
+            # for a reaction without stoichiometry.
+            results.append(ElementalBalance(rxn.id, "unknown"))
+            continue
+        if any(not met.formula for met in rxn.metabolites):
+            results.append(ElementalBalance(rxn.id, "unknown"))
+            continue
+        imbalance = {
+            element: amount
+            for element, amount in rxn.check_mass_balance().items()
+            if element != "charge"
+        }
+        if imbalance:
+            results.append(ElementalBalance(rxn.id, "unbalanced", imbalance))
+        else:
+            results.append(ElementalBalance(rxn.id, "balanced"))
+    return results
diff --git a/src/raven_python/utils/gpr.py b/src/raven_python/utils/gpr.py
new file mode 100644
index 0000000..2e2122d
--- /dev/null
+++ b/src/raven_python/utils/gpr.py
@@ -0,0 +1,119 @@
+"""GPR (gene-protein-reaction rule) linting.
+
+Flag GPRs that are *not* in disjunctive normal form ("OR of AND-complexes"), via cobra's
+GPR AST. GPR syntax *normalisation* is already done by cobra on assignment, so it isn't
+re-implemented here.
+
+Part (2) has no cobrapy equivalent and is ported here, reworked onto cobra's
+GPR AST instead of RAVEN's brittle substring search. The relevant property is
+**disjunctive normal form (DNF)**: an OR of AND-clauses of single genes, e.g.
+``(G1 and G2) or G3``. Rules where an AND contains an OR — e.g.
+``(G1 or G2) and (G3 or G4)`` — are *valid* for cobra but ambiguous for the
+isoenzyme/complex reasoning used across RAVEN/GECKO, and ``expand_model``
+(see :mod:`raven_python.manipulation.expand`) only does something for DNF rules.
+:func:`find_non_dnf_grrules` surfaces them as structured data rather than, as
+RAVEN did, only printing a warning.
+"""
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _contains_or(node: ast.AST | None) -> bool:
+    """True if ``node``'s subtree contains an OR operator anywhere."""
+    if isinstance(node, ast.BoolOp):
+        if isinstance(node.op, ast.Or):
+            return True
+        return any(_contains_or(value) for value in node.values)
+    return False
+
+
+def _is_dnf_node(node: ast.AST | None) -> bool:
+    """True if the AST rooted at ``node`` is in disjunctive normal form.
+
+    DNF here means no AND operator has an OR anywhere beneath it, i.e. the
+    rule is a single gene, a pure AND-complex, or an OR of those.
+    """
+    if node is None or isinstance(node, ast.Name):
+        return True
+    if isinstance(node, ast.BoolOp):
+        if isinstance(node.op, ast.And):
+            return not any(_contains_or(value) for value in node.values)
+        # OR: every disjunct must itself be DNF
+        return all(_is_dnf_node(value) for value in node.values)
+    # Unknown node type: don't flag it as a problem.
+    return True
+
+
+def is_dnf(gpr: GPR | str | None) -> bool:
+    """Return whether a GPR is in disjunctive normal form (OR of AND-complexes).
+
+    Parameters
+    ----------
+    gpr
+        A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``.
+        An empty/``None`` rule is trivially DNF.
+
+    Examples
+    --------
+    >>> is_dnf("(G1 and G2) or G3")
+    True
+    >>> is_dnf("(G1 or G2) and G3")
+    False
+    """
+    if isinstance(gpr, str):
+        gpr = GPR.from_string(gpr)
+    if gpr is None:
+        return True
+    return _is_dnf_node(gpr.body)
+
+
+@dataclass(frozen=True)
+class GPRIssue:
+    """A reaction whose GPR is flagged by the linter.
+
+    Attributes
+    ----------
+    reaction_id
+        ID of the reaction.
+    gpr
+        The (already cobra-normalised) grRule string.
+    reason
+        Human-readable explanation of why it was flagged.
+    """
+
+    reaction_id: str
+    gpr: str
+    reason: str
+
+
+_NON_DNF_REASON = (
+    "GPR is not in disjunctive normal form (an AND clause contains an OR). "
+    "Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, "
+    'e.g. rewrite "(G1 or G2) and (G3 or G4)" as '
+    '"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".'
+)
+
+
+def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]:
+    """Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes").
+
+    Uses cobra's GPR AST. Reactions with no GPR are skipped.
+
+    Returns
+    -------
+    list of GPRIssue
+        One entry per flagged reaction, in model reaction order. Empty if all
+        GPRs are simple OR-of-AND-complexes.
+    """
+    issues: list[GPRIssue] = []
+    for rxn in model.reactions:
+        if not rxn.gene_reaction_rule:
+            continue
+        if not is_dnf(rxn.gpr):
+            issues.append(GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON))
+    return issues
diff --git a/src/raven_python/utils/parse.py b/src/raven_python/utils/parse.py
new file mode 100644
index 0000000..8068f6c
--- /dev/null
+++ b/src/raven_python/utils/parse.py
@@ -0,0 +1,33 @@
+"""Small parsing helpers shared across raven_python."""
+from __future__ import annotations
+
+import re
+
+# A metabolite written as ``name[comp]``. The name is greedy so that, for a
+# pathological name that itself contains brackets, the *last* ``[...]`` is taken
+# as the compartment (matching RAVEN getIndexes' ``max(strfind('['))`` rule).
+_NAME_COMP_RE = re.compile(r"^(?P<name>.+)\[(?P<comp>[^\[\]]+)\]$")
+
+
+def parse_name_comp(token: str) -> tuple[str, str | None]:
+    """Split a ``name[comp]`` token into ``(name, compartment)``.
+
+    This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``'
+    ``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written
+    as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``.
+
+    Returns ``(name, None)`` when there is no trailing ``[...]``.
+
+    Examples
+    --------
+    >>> parse_name_comp("ATP[c]")
+    ('ATP', 'c')
+    >>> parse_name_comp("ATP")
+    ('ATP', None)
+    >>> parse_name_comp("weird[name][m]")
+    ('weird[name]', 'm')
+    """
+    match = _NAME_COMP_RE.match(token.strip())
+    if match:
+        return match.group("name").strip(), match.group("comp").strip()
+    return token.strip(), None
diff --git a/src/raven_python/utils/sort.py b/src/raven_python/utils/sort.py
new file mode 100644
index 0000000..a8641a8
--- /dev/null
+++ b/src/raven_python/utils/sort.py
@@ -0,0 +1,21 @@
+"""Sort a model's identifiers alphabetically — useful for deterministic,
+diff-friendly output.
+
+cobra's ``DictList.sort`` reorders one list (and rebuilds its lookup index), but
+there is no single "sort the whole model" call; this provides it.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def sort_identifiers(model: cobra.Model) -> cobra.Model:
+    """Sort reactions, metabolites and genes alphabetically by ID, in place.
+
+    Returns the same (mutated) model for convenience. Compartments are a plain
+    dict and are emitted sorted by writers as needed.
+    """
+    model.reactions.sort(key=lambda r: r.id)
+    model.metabolites.sort(key=lambda m: m.id)
+    model.genes.sort(key=lambda g: g.id)
+    return model
diff --git a/src/raven_python/utils/validate.py b/src/raven_python/utils/validate.py
new file mode 100644
index 0000000..c08df48
--- /dev/null
+++ b/src/raven_python/utils/validate.py
@@ -0,0 +1,86 @@
+"""Curation checks for a model.
+
+A QC bundle cobra has no single call for: orphaned objects, empty reactions,
+duplicated metabolite ``name + compartment``, empty names, and objective sanity.
+:func:`check_model` returns these as structured :class:`ModelIssue` records.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ModelIssue:
+    """One curation issue found in a model.
+
+    Attributes
+    ----------
+    category
+        Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``,
+        ``"orphan_gene"``, ``"duplicate_name_compartment"``,
+        ``"empty_metabolite_name"``, ``"objective"``.
+    object_id
+        ID of the offending object, or ``None`` for model-level issues.
+    message
+        Human-readable description.
+    """
+
+    category: str
+    object_id: str | None
+    message: str
+
+
+def check_model(model: cobra.Model) -> list[ModelIssue]:
+    """Run curation checks on a model and return the issues found.
+
+    Does not
+    raise; returns a (possibly empty) list of :class:`ModelIssue`.
+    """
+    issues: list[ModelIssue] = []
+
+    for met in model.metabolites:
+        if not met.reactions:
+            issues.append(
+                ModelIssue("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.")
+            )
+        if not (met.name and str(met.name).strip()):
+            issues.append(
+                ModelIssue("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.")
+            )
+
+    for gene in model.genes:
+        if not gene.reactions:
+            issues.append(
+                ModelIssue("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.")
+            )
+
+    for rxn in model.reactions:
+        if not rxn.metabolites:
+            issues.append(
+                ModelIssue("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.")
+            )
+
+    by_name_comp: dict[tuple[str, str], list[str]] = {}
+    for met in model.metabolites:
+        by_name_comp.setdefault((met.name, met.compartment), []).append(met.id)
+    for (name, comp), ids in by_name_comp.items():
+        if name and len(ids) > 1:
+            issues.append(
+                ModelIssue(
+                    "duplicate_name_compartment",
+                    None,
+                    f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}",
+                )
+            )
+
+    objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0]
+    if not objective_rxns:
+        issues.append(ModelIssue("objective", None, "No reaction has a nonzero objective coefficient."))
+    elif len(objective_rxns) > 1:
+        issues.append(
+            ModelIssue("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}")
+        )
+
+    return issues
diff --git a/tests/test_utils_balance.py b/tests/test_utils_balance.py
new file mode 100644
index 0000000..aa1e47e
--- /dev/null
+++ b/tests/test_utils_balance.py
@@ -0,0 +1,76 @@
+"""Tests for get_elemental_balance (getElementalBalance port)."""
+import cobra
+import pytest
+
+from raven_python.utils import ElementalBalance, get_elemental_balance
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("a_c", formula="C6H12O6", charge=0, compartment="c"),
+            cobra.Metabolite("b_c", formula="C6H12O6", charge=0, compartment="c"),
+            cobra.Metabolite("c_c", formula="C3H6O3", charge=0, compartment="c"),
+            cobra.Metabolite("n_c", compartment="c"),  # no formula
+        ]
+    )
+    r_bal = cobra.Reaction("R_bal")
+    m.add_reactions([r_bal])
+    r_bal.build_reaction_from_string("a_c --> b_c")        # C6H12O6 -> C6H12O6
+    r_unbal = cobra.Reaction("R_unbal")
+    m.add_reactions([r_unbal])
+    r_unbal.build_reaction_from_string("a_c --> c_c")      # C6H12O6 -> C3H6O3
+    r_unknown = cobra.Reaction("R_unknown")
+    m.add_reactions([r_unknown])
+    r_unknown.build_reaction_from_string("a_c --> n_c")    # n_c has no formula
+    return m
+
+
+def test_balanced(model):
+    (res,) = get_elemental_balance(model, ["R_bal"])
+    assert res == ElementalBalance("R_bal", "balanced", {})
+
+
+def test_unbalanced_reports_imbalance(model):
+    (res,) = get_elemental_balance(model, ["R_unbal"])
+    assert res.status == "unbalanced"
+    # products - reactants: C3H6O3 - C6H12O6 = -C3H6O3
+    assert res.imbalance == {"C": -3.0, "H": -6.0, "O": -3.0}
+
+
+def test_missing_formula_is_unknown_not_silently_wrong(model):
+    # cobra's check_mass_balance alone would silently report an imbalance here;
+    # we flag it as unknown instead.
+    (res,) = get_elemental_balance(model, ["R_unknown"])
+    assert res.status == "unknown"
+    assert res.imbalance == {}
+
+
+def test_all_reactions_default(model):
+    results = get_elemental_balance(model)
+    assert {r.reaction_id: r.status for r in results} == {
+        "R_bal": "balanced",
+        "R_unbal": "unbalanced",
+        "R_unknown": "unknown",
+    }
+
+
+def test_charge_excluded(model):
+    # give a charge imbalance but keep elements balanced -> still "balanced"
+    model.metabolites.get_by_id("b_c").charge = 1
+    (res,) = get_elemental_balance(model, ["R_bal"])
+    assert res.status == "balanced"
+
+
+# --- regression: empty reaction → unknown (known_issues.md F5) -------------
+
+def test_empty_reaction_is_unknown(model):
+    """A reaction with no metabolites used to be reported `balanced`
+    vacuously (any() over an empty list is False and check_mass_balance
+    returns no imbalance). Now reports `unknown`."""
+    empty = cobra.Reaction("R_empty", lower_bound=0, upper_bound=1000)
+    model.add_reactions([empty])
+    (res,) = get_elemental_balance(model, ["R_empty"])
+    assert res.status == "unknown"
diff --git a/tests/test_utils_gpr.py b/tests/test_utils_gpr.py
new file mode 100644
index 0000000..275d020
--- /dev/null
+++ b/tests/test_utils_gpr.py
@@ -0,0 +1,84 @@
+"""Tests for raven_python.utils.gpr (GPR linting)."""
+import cobra
+import pytest
+
+from raven_python.utils import GPRIssue, find_non_dnf_grrules, is_dnf
+
+
+@pytest.mark.parametrize(
+    "rule",
+    [
+        "",
+        "G1",
+        "G1 and G2",
+        "G1 or G2",
+        "G1 and G2 and G3",
+        "G1 or G2 or G3",
+        "(G1 and G2) or G3",
+        "(G1 and G2) or (G3 and G4)",
+        "G1 or (G2 and G3)",
+    ],
+)
+def test_is_dnf_true(rule):
+    assert is_dnf(rule) is True
+
+
+@pytest.mark.parametrize(
+    "rule",
+    [
+        "(G1 or G2) and G3",
+        "G1 and (G2 or G3)",
+        "(G1 or G2) and (G3 or G4)",
+        "G1 and (G2 or (G3 and G4))",
+    ],
+)
+def test_is_dnf_false(rule):
+    assert is_dnf(rule) is False
+
+
+def test_is_dnf_accepts_gpr_and_none():
+    from cobra.core.gene import GPR
+
+    assert is_dnf(GPR.from_string("(G1 or G2) and G3")) is False
+    assert is_dnf(GPR.from_string("G1 or G2")) is True
+    assert is_dnf(None) is True
+
+
+def test_is_dnf_independent_of_formatting():
+    # cobra normalises on assignment, so casing/whitespace cannot change the verdict.
+    assert is_dnf("(G1 OR G2)   AND   G3") is False
+    assert is_dnf("( G1 and G2 )  or  G3") is True
+
+
+def _model_with_rules(rules: dict[str, str]) -> cobra.Model:
+    model = cobra.Model("t")
+    model.add_reactions([cobra.Reaction(rid) for rid in rules])
+    for rid, rule in rules.items():
+        model.reactions.get_by_id(rid).gene_reaction_rule = rule
+    return model
+
+
+def test_find_non_dnf_grrules_flags_only_offenders():
+    model = _model_with_rules(
+        {
+            "R_ok_single": "G1",
+            "R_ok_complex": "G1 and G2",
+            "R_ok_dnf": "(G1 and G2) or G3",
+            "R_no_gpr": "",
+            "R_bad_1": "(G1 or G2) and G3",
+            "R_bad_2": "(G1 or G2) and (G3 or G4)",
+        }
+    )
+
+    issues = find_non_dnf_grrules(model)
+
+    assert [i.reaction_id for i in issues] == ["R_bad_1", "R_bad_2"]
+    assert all(isinstance(i, GPRIssue) for i in issues)
+    assert all("disjunctive normal form" in i.reason for i in issues)
+    # the reported GPR is the cobra-normalised string
+    assert issues[0].gpr == "(G1 or G2) and G3"
+
+
+def test_find_non_dnf_grrules_empty_when_all_clean():
+    model = _model_with_rules({"R1": "G1 or G2", "R2": "(G1 and G2) or G3"})
+    assert find_non_dnf_grrules(model) == []
diff --git a/tests/test_utils_sort.py b/tests/test_utils_sort.py
new file mode 100644
index 0000000..18bca24
--- /dev/null
+++ b/tests/test_utils_sort.py
@@ -0,0 +1,42 @@
+"""Tests for sort_identifiers and write_yaml_model(sort_ids=True)."""
+import cobra
+
+from raven_python.io import read_yaml_model, write_yaml_model
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils import sort_identifiers
+
+
+def _model():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("b_c", "a_c")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R2", "equation": "a_c --> b_c", "gene_reaction_rule": "GB"},
+            {"id": "R1", "equation": "b_c --> a_c", "gene_reaction_rule": "GA"},
+        ],
+    )
+    return m
+
+
+def test_sort_identifiers_orders_everything():
+    m = _model()
+    sort_identifiers(m)
+    assert [r.id for r in m.reactions] == ["R1", "R2"]
+    assert [x.id for x in m.metabolites] == ["a_c", "b_c"]
+    assert [g.id for g in m.genes] == ["GA", "GB"]
+    # lookup index still intact after sorting
+    assert m.reactions.get_by_id("R2").id == "R2"
+
+
+def test_write_yaml_sort_ids_does_not_mutate(tmp_path):
+    m = _model()
+    order_before = [r.id for r in m.reactions]
+    out = tmp_path / "m.yml"
+    write_yaml_model(m, out, sort_ids=True)
+    assert [r.id for r in m.reactions] == order_before  # model untouched
+    # but the file is sorted
+    text = out.read_text()
+    assert text.index("R1") < text.index("R2")
+    reloaded = read_yaml_model(out)
+    assert [r.id for r in reloaded.reactions] == ["R1", "R2"]
diff --git a/tests/test_utils_validate.py b/tests/test_utils_validate.py
new file mode 100644
index 0000000..2d38e6f
--- /dev/null
+++ b/tests/test_utils_validate.py
@@ -0,0 +1,80 @@
+"""Tests for check_model (the surviving checks of checkModelStruct)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils import ModelIssue, check_model
+
+
+def _categories(issues, category):
+    return [i.object_id for i in issues if i.category == category]
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("a_c", name="A", compartment="c"),
+            cobra.Metabolite("b_c", name="B", compartment="c"),
+        ]
+    )
+    add_reactions_from_equations(
+        m, [{"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}]
+    )
+    m.reactions.get_by_id("R1").objective_coefficient = 1
+    return m
+
+
+def test_clean_model_has_no_issues(model):
+    assert check_model(model) == []
+
+
+def test_orphan_metabolite(model):
+    model.add_metabolites([cobra.Metabolite("orphan_c", name="Orphan", compartment="c")])
+    assert "orphan_c" in _categories(check_model(model), "orphan_metabolite")
+
+
+def test_orphan_gene(model):
+    model.genes.append(cobra.core.gene.Gene("G_lonely"))
+    assert "G_lonely" in _categories(check_model(model), "orphan_gene")
+
+
+def test_empty_reaction(model):
+    model.add_reactions([cobra.Reaction("R_empty")])
+    assert "R_empty" in _categories(check_model(model), "empty_reaction")
+
+
+def test_empty_metabolite_name(model):
+    model.add_metabolites([cobra.Metabolite("noname_c", compartment="c")])
+    # also an orphan, but we check the name category specifically
+    assert "noname_c" in _categories(check_model(model), "empty_metabolite_name")
+
+
+def test_duplicate_name_compartment(model):
+    # second metabolite named "A" in compartment c
+    dup = cobra.Metabolite("a2_c", name="A", compartment="c")
+    model.add_metabolites([dup])
+    model.reactions.get_by_id("R1").add_metabolites({dup: -1})  # keep it used
+    issues = [i for i in check_model(model) if i.category == "duplicate_name_compartment"]
+    assert len(issues) == 1
+    assert "a_c" in issues[0].message and "a2_c" in issues[0].message
+
+
+def test_no_objective(model):
+    model.reactions.get_by_id("R1").objective_coefficient = 0
+    cats = [i.category for i in check_model(model)]
+    assert "objective" in cats
+
+
+def test_multiple_objectives(model):
+    add_reactions_from_equations(model, [{"id": "R2", "equation": "b_c --> a_c"}])
+    model.reactions.get_by_id("R2").objective_coefficient = 1
+    obj_issues = [i for i in check_model(model) if i.category == "objective"]
+    assert len(obj_issues) == 1
+    assert "Multiple" in obj_issues[0].message
+
+
+def test_returns_model_issue_instances(model):
+    model.add_reactions([cobra.Reaction("R_empty")])
+    assert all(isinstance(i, ModelIssue) for i in check_model(model))

From 4bc0d6ee29070a0bad4e347a78947c7fae8199a0 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:46:59 +0200
Subject: [PATCH 4/7] Add the model-manipulation layer (add, remove, transport,
 merge, etc.)

---
 src/raven_python/manipulation/__init__.py     |  36 ++
 src/raven_python/manipulation/add.py          | 345 ++++++++++++++++++
 src/raven_python/manipulation/change.py       | 125 +++++++
 src/raven_python/manipulation/compartments.py | 196 ++++++++++
 src/raven_python/manipulation/expand.py       | 124 +++++++
 src/raven_python/manipulation/irreversible.py |  72 ++++
 src/raven_python/manipulation/merge.py        | 146 ++++++++
 src/raven_python/manipulation/parameters.py   |  78 ++++
 src/raven_python/manipulation/remove.py       | 120 ++++++
 src/raven_python/manipulation/simplify.py     | 229 ++++++++++++
 src/raven_python/manipulation/transfer.py     | 144 ++++++++
 src/raven_python/manipulation/transport.py    | 157 ++++++++
 tests/test_change_grrules.py                  |  49 +++
 tests/test_manipulation_add.py                | 278 ++++++++++++++
 tests/test_manipulation_change.py             |  93 +++++
 tests/test_manipulation_compartments.py       | 139 +++++++
 tests/test_manipulation_expand.py             | 288 +++++++++++++++
 tests/test_manipulation_irreversible.py       | 144 ++++++++
 tests/test_manipulation_merge.py              | 136 +++++++
 tests/test_manipulation_remove.py             |  97 +++++
 tests/test_manipulation_simplify.py           | 184 ++++++++++
 tests/test_manipulation_transfer.py           | 137 +++++++
 tests/test_manipulation_transport.py          |  98 +++++
 tests/test_parameters.py                      |  60 +++
 24 files changed, 3475 insertions(+)
 create mode 100644 src/raven_python/manipulation/__init__.py
 create mode 100644 src/raven_python/manipulation/add.py
 create mode 100644 src/raven_python/manipulation/change.py
 create mode 100644 src/raven_python/manipulation/compartments.py
 create mode 100644 src/raven_python/manipulation/expand.py
 create mode 100644 src/raven_python/manipulation/irreversible.py
 create mode 100644 src/raven_python/manipulation/merge.py
 create mode 100644 src/raven_python/manipulation/parameters.py
 create mode 100644 src/raven_python/manipulation/remove.py
 create mode 100644 src/raven_python/manipulation/simplify.py
 create mode 100644 src/raven_python/manipulation/transfer.py
 create mode 100644 src/raven_python/manipulation/transport.py
 create mode 100644 tests/test_change_grrules.py
 create mode 100644 tests/test_manipulation_add.py
 create mode 100644 tests/test_manipulation_change.py
 create mode 100644 tests/test_manipulation_compartments.py
 create mode 100644 tests/test_manipulation_expand.py
 create mode 100644 tests/test_manipulation_irreversible.py
 create mode 100644 tests/test_manipulation_merge.py
 create mode 100644 tests/test_manipulation_remove.py
 create mode 100644 tests/test_manipulation_simplify.py
 create mode 100644 tests/test_manipulation_transfer.py
 create mode 100644 tests/test_manipulation_transport.py
 create mode 100644 tests/test_parameters.py

diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py
new file mode 100644
index 0000000..074c36f
--- /dev/null
+++ b/src/raven_python/manipulation/__init__.py
@@ -0,0 +1,36 @@
+"""Generic cobra.Model structural transforms that cobrapy does not cover cleanly:
+reaction building from equations, batch GPR / bound changes, irreversibility splitting,
+isozyme expansion, compartment merge / copy, and model merging by name."""
+from .add import add_reactions_from_equations
+from .change import change_gene_reaction_rules, change_reaction_equations
+from .expand import expand_model
+from .irreversible import convert_to_irreversible
+from .merge import merge_models
+from .parameters import set_variance_bounds
+from .remove import remove_genes, remove_metabolites
+from .simplify import (
+    constrain_reversible_reactions,
+    group_linear_reactions,
+    remove_dead_end_reactions,
+    remove_duplicate_reactions,
+)
+from .transfer import add_reactions_from_model
+from .transport import add_transport_reactions
+
+__all__ = [
+    "add_reactions_from_equations",
+    "add_reactions_from_model",
+    "add_transport_reactions",
+    "change_gene_reaction_rules",
+    "change_reaction_equations",
+    "constrain_reversible_reactions",
+    "convert_to_irreversible",
+    "expand_model",
+    "group_linear_reactions",
+    "merge_models",
+    "remove_dead_end_reactions",
+    "remove_duplicate_reactions",
+    "remove_genes",
+    "remove_metabolites",
+    "set_variance_bounds",
+]
diff --git a/src/raven_python/manipulation/add.py b/src/raven_python/manipulation/add.py
new file mode 100644
index 0000000..3842297
--- /dev/null
+++ b/src/raven_python/manipulation/add.py
@@ -0,0 +1,345 @@
+"""Add reactions to a model from equation strings.
+
+Most of the equivalent MATLAB code is struct-of-arrays bookkeeping (padding parallel
+``rxnNames`` / ``lb`` / ``ub`` / ``grRules`` / ... fields) that does not exist in
+cobra, where each ``Reaction`` carries its own attributes. cobra also already
+covers a large part of the *behaviour*:
+
+* ``Reaction.build_reaction_from_string`` parses equation strings, coefficients,
+  and reversibility arrows (``<=>``, ``-->``, ``=>``) and creates unknown
+  metabolites — but only matching metabolites **by ID**, and it leaves new
+  metabolites with ``compartment=None``.
+* assigning ``reaction.gene_reaction_rule`` auto-creates ``Gene`` objects.
+
+So this port keeps only the parts cobra lacks:
+
+* **name-based matching** — interpret equation tokens as metabolite *names*
+  (RAVEN eqnType 2) or as ``name[comp]`` (eqnType 3), not just IDs;
+* **correct compartment** assignment for newly created metabolites;
+* **strict policies** — optionally *error* (rather than silently create) on
+  unknown metabolites or genes, and always error on a duplicate reaction ID
+  (cobra silently ignores those).
+
+Instead of RAVEN's ``eqnType`` integer (1/2/3) the matching mode is a readable
+keyword: ``mets_by="id"`` or ``mets_by="name"``, with ``name[comp]`` recognised
+automatically. See IMPROVEMENTS.md (A-series) for the rationale.
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from collections import OrderedDict
+from collections.abc import Mapping, Sequence
+
+import cobra
+from cobra import Metabolite, Reaction
+from cobra.core.gene import GPR
+
+from raven_python.utils.parse import parse_name_comp
+
+# Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it).
+_REVERSIBLE_ARROWS = ("<=>",)
+_FORWARD_ARROWS = ("-->", "->", "=>")
+
+
+def _split_equation(equation: str) -> tuple[str, str, bool]:
+    """Split an equation into (lhs, rhs, reversible) on its arrow."""
+    for arrow in _REVERSIBLE_ARROWS:
+        if arrow in equation:
+            lhs, rhs = equation.split(arrow, 1)
+            return lhs, rhs, True
+    for arrow in _FORWARD_ARROWS:
+        if arrow in equation:
+            lhs, rhs = equation.split(arrow, 1)
+            return lhs, rhs, False
+    raise ValueError(f"No reaction arrow (<=>, -->, =>) found in equation: {equation!r}")
+
+
+def _parse_side(side: str) -> list[tuple[float, str, str | None]]:
+    """Parse one side of an equation into ``[(coefficient, token, fallback), ...]``.
+
+    The ``fallback`` slot is for the ambiguous ``"<number> <rest>"`` shape: when
+    matching by name, ``"2 oxoglutarate"`` could be either ``coeff=2, name="oxoglutarate"``
+    or ``coeff=1, name="2 oxoglutarate"`` (a real chemistry name). We return the
+    coefficient-split form as the primary and the full term as the fallback; the
+    resolver picks whichever matches an existing metabolite. Pure-number heads
+    with no name (``"2"``) and pure-name terms (``"glucose"``) have no fallback.
+    """
+    terms: list[tuple[float, str, str | None]] = []
+    for raw in side.split(" + "):
+        term = raw.strip()
+        if not term:
+            continue
+        head, _, tail = term.partition(" ")
+        try:
+            coeff = float(head)
+            token = tail.strip()
+        except ValueError:
+            coeff, token = 1.0, term
+            fallback = None
+        else:
+            # Coefficient-split succeeded. Keep the full term as a fallback when
+            # the tail is non-empty so name-resolution can re-try it as one token.
+            fallback = term if token else None
+        if not token:
+            raise ValueError(f"Missing metabolite after coefficient in term: {raw!r}")
+        terms.append((coeff, token, fallback))
+    return terms
+
+
+def _new_met_id(model: cobra.Model, prefix: str) -> str:
+    """Next free ``<prefix><int>`` metabolite ID (RAVEN m1, m2, ... scheme)."""
+    pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+    used = [int(m.group(1)) for met in model.metabolites if (m := pattern.match(met.id))]
+    n = max(used) + 1 if used else 1
+    while f"{prefix}{n}" in model.metabolites:
+        n += 1
+    return f"{prefix}{n}"
+
+
+def _try_existing(
+    model: cobra.Model, token: str, *, mets_by: str, compartment: str | None
+) -> Metabolite | None:
+    """Look up ``token`` as an existing metabolite (no creation, no side effects).
+
+    Returns the matching metabolite or ``None``. Used by ``_stoichiometry`` to
+    disambiguate the ``"<number> <rest>"`` shape: if a metabolite whose *name*
+    (or id) literally contains a leading number exists, prefer it over splitting
+    the number off as a coefficient.
+    """
+    name, comp = parse_name_comp(token)
+    if mets_by == "id" and comp is None:
+        return model.metabolites.get_by_id(token) if token in model.metabolites else None
+    target_comp = comp if comp is not None else compartment
+    if target_comp is None:
+        return None
+    for met in model.metabolites:
+        if met.name == name and met.compartment == target_comp:
+            return met
+    return None
+
+
+def _resolve_metabolite(
+    model: cobra.Model,
+    token: str,
+    *,
+    mets_by: str,
+    compartment: str | None,
+    allow_new_mets: bool,
+    new_met_prefix: str,
+) -> Metabolite:
+    """Resolve an equation token to an existing or newly created Metabolite."""
+    name, comp = parse_name_comp(token)
+
+    if mets_by == "id" and comp is None:
+        # token is a metabolite ID
+        if token in model.metabolites:
+            return model.metabolites.get_by_id(token)
+        if not allow_new_mets:
+            raise ValueError(
+                f"Unknown metabolite ID {token!r}; pass allow_new_mets=True to create it."
+            )
+        if compartment is None:
+            raise ValueError(
+                f"Cannot create metabolite {token!r}: no compartment given."
+            )
+        _warn_unknown_compartment(model, compartment, token)
+        met = Metabolite(token, compartment=compartment)
+        model.add_metabolites([met])
+        return met
+
+    # name-based (mets_by="name") or explicit name[comp]
+    target_comp = comp if comp is not None else compartment
+    if target_comp is None:
+        raise ValueError(
+            f"Metabolite {token!r} matched by name needs a compartment; "
+            "pass compartment=... or use the name[comp] syntax."
+        )
+    if comp is not None and target_comp not in model.compartments and not allow_new_mets:
+        raise ValueError(f"Compartment {target_comp!r} is not in the model.")
+
+    matches = [
+        met
+        for met in model.metabolites
+        if met.name == name and met.compartment == target_comp
+    ]
+    if matches:
+        return matches[0]
+    if not allow_new_mets:
+        raise ValueError(
+            f"No metabolite named {name!r} in compartment {target_comp!r}; "
+            "pass allow_new_mets=True to create it."
+        )
+    _warn_unknown_compartment(model, target_comp, name)
+    met = Metabolite(_new_met_id(model, new_met_prefix), name=name, compartment=target_comp)
+    model.add_metabolites([met])
+    return met
+
+
+def _warn_unknown_compartment(model: cobra.Model, compartment: str, identifier: str) -> None:
+    """Warn when a new metabolite would be born into a not-yet-registered compartment.
+
+    Both ``mets_by`` paths previously created the metabolite without validating
+    the compartment, so a typo (``"cyto"`` for ``"c"``) silently produced a
+    one-metabolite ghost compartment. cobra inherits the compartment from the
+    first metabolite assigned to it, so the fix is a warning, not a hard error.
+    """
+    known = set(model.compartments) | set(model._compartments)
+    if compartment not in known:
+        warnings.warn(
+            f"Creating metabolite {identifier!r} in unregistered compartment "
+            f"{compartment!r} (existing: {sorted(known) or 'none'}); "
+            "add the compartment first or check for a typo.",
+            stacklevel=5,
+        )
+
+
+def _stoichiometry(
+    model: cobra.Model,
+    equation: str,
+    *,
+    mets_by: str,
+    compartment: str | None,
+    allow_new_mets: bool,
+    new_met_prefix: str,
+) -> tuple[dict[Metabolite, float], bool]:
+    """Parse an equation into a {Metabolite: net coefficient} dict + reversibility."""
+    lhs, rhs, reversible = _split_equation(equation)
+    coeffs: OrderedDict[Metabolite, float] = OrderedDict()
+    had_terms = False
+    for sign, side in ((-1.0, lhs), (1.0, rhs)):
+        for coeff, token, fallback in _parse_side(side):
+            had_terms = True
+            # "<number> <name>" is ambiguous when the name itself starts with a
+            # number (e.g. "2 oxoglutarate"). Prefer the full-term interpretation
+            # when it matches an existing metabolite — otherwise fall through to
+            # the coefficient-split form.
+            met = None
+            if fallback is not None:
+                met = _try_existing(
+                    model, fallback, mets_by=mets_by, compartment=compartment
+                )
+                if met is not None:
+                    coeff = 1.0
+            if met is None:
+                met = _resolve_metabolite(
+                    model,
+                    token,
+                    mets_by=mets_by,
+                    compartment=compartment,
+                    allow_new_mets=allow_new_mets,
+                    new_met_prefix=new_met_prefix,
+                )
+            coeffs[met] = coeffs.get(met, 0.0) + sign * coeff
+    # Drop metabolites that net to zero (present as both substrate and product).
+    coeffs = OrderedDict((met, c) for met, c in coeffs.items() if c != 0.0)
+    if had_terms and not coeffs:
+        warnings.warn(
+            f"Equation {equation!r} has no net metabolites (all terms cancelled); "
+            "the reaction will be added with empty stoichiometry.",
+            stacklevel=4,
+        )
+    return dict(coeffs), reversible
+
+
+def add_reactions_from_equations(
+    model: cobra.Model,
+    reactions: Sequence[Mapping],
+    *,
+    mets_by: str = "id",
+    compartment: str | None = None,
+    allow_new_mets: bool = True,
+    allow_new_genes: bool = True,
+    new_met_prefix: str = "m",
+) -> list[Reaction]:
+    """Add reactions defined by equation strings, matching mets by ID or name.
+    Parameters
+    ----------
+    model
+        Target ``cobra.Model``, mutated in place.
+    reactions
+        Sequence of mappings, one per reaction. Recognised keys:
+
+        * ``id`` (**required**) — reaction ID; must not already exist.
+        * ``equation`` (**required**) — e.g. ``"atp_c + h2o_c <=> adp_c + pi_c"``.
+          Use ``<=>`` for reversible, ``-->``/``->``/``=>`` for irreversible.
+        * ``name`` — reaction name.
+        * ``bounds`` — ``(lower, upper)`` tuple; overrides the arrow.
+        * ``gene_reaction_rule`` — GPR string.
+        * ``subsystem`` — subsystem name.
+    mets_by
+        How bare equation tokens (without ``[comp]``) are matched:
+        ``"id"`` (RAVEN eqnType 1) or ``"name"`` (eqnType 2). A ``name[comp]``
+        token (eqnType 3) is always matched by name + compartment.
+    compartment
+        Default compartment for new metabolites and for name-matched tokens
+        without an explicit ``[comp]``.
+    allow_new_mets
+        If True (default), create metabolites not found. New metabolites get
+        ``compartment`` (id mode) or an auto ID ``m1``, ``m2``, ... (name mode).
+        If False, an unknown metabolite raises.
+    allow_new_genes
+        If True (default), genes in a GPR are auto-created by cobra. If False,
+        a GPR referencing a gene not already in the model raises.
+    new_met_prefix
+        Prefix for auto-generated metabolite IDs in name mode (default ``"m"``).
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions added, in input order.
+    """
+    if mets_by not in ("id", "name"):
+        raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")
+
+    known_genes = {gene.id for gene in model.genes}
+    added: list[Reaction] = []
+
+    for spec in reactions:
+        if "id" not in spec:
+            raise ValueError(f"Reaction spec missing required 'id': {spec!r}")
+        rxn_id = spec["id"]
+        if rxn_id in model.reactions:
+            raise ValueError(
+                f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first."
+            )
+        if "equation" not in spec:
+            raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.")
+
+        coeffs, reversible = _stoichiometry(
+            model,
+            spec["equation"],
+            mets_by=mets_by,
+            compartment=compartment,
+            allow_new_mets=allow_new_mets,
+            new_met_prefix=new_met_prefix,
+        )
+
+        rxn = Reaction(rxn_id, name=spec.get("name", ""))
+        if "bounds" in spec:
+            rxn.bounds = tuple(spec["bounds"])
+        else:
+            config = cobra.Configuration()
+            lower = config.lower_bound if reversible else 0.0
+            rxn.bounds = (lower, config.upper_bound)
+        if "subsystem" in spec:
+            rxn.subsystem = spec["subsystem"]
+
+        model.add_reactions([rxn])
+        rxn.add_metabolites(coeffs)
+
+        rule = spec.get("gene_reaction_rule", "")
+        if rule:
+            if not allow_new_genes:
+                missing = sorted(set(GPR.from_string(rule).genes) - known_genes)
+                if missing:
+                    raise ValueError(
+                        f"Reaction {rxn_id!r} references genes not in the model: "
+                        f"{missing}. Set allow_new_genes=True or add them first."
+                    )
+            rxn.gene_reaction_rule = rule
+            known_genes.update(gene.id for gene in rxn.genes)
+
+        added.append(rxn)
+
+    return added
diff --git a/src/raven_python/manipulation/change.py b/src/raven_python/manipulation/change.py
new file mode 100644
index 0000000..78612ba
--- /dev/null
+++ b/src/raven_python/manipulation/change.py
@@ -0,0 +1,125 @@
+"""Change the stoichiometry of existing reactions from equation strings.
+
+Editing the same ``Reaction`` object changes only its stoichiometry — its id, name,
+bounds, GPR, subsystem, and position are preserved automatically by cobra.
+
+So this port simply re-parses the equation (reusing the same metabolite
+matching as :func:`~raven_python.manipulation.add.add_reactions_from_equations`,
+including name and ``name[comp]`` modes that cobra lacks) and swaps the
+metabolites in place.
+
+Like RAVEN, **bounds are left unchanged** even if the new equation's arrow
+implies a different reversibility — use a bounds setter for that.
+"""
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+import cobra
+from cobra import Reaction
+
+from raven_python.manipulation.add import _stoichiometry
+
+__all__ = ["change_reaction_equations", "change_gene_reaction_rules"]
+
+
+def change_reaction_equations(
+    model: cobra.Model,
+    equations: Mapping[str, str],
+    *,
+    mets_by: str = "id",
+    compartment: str | None = None,
+    allow_new_mets: bool = True,
+    new_met_prefix: str = "m",
+) -> list[Reaction]:
+    """Replace the stoichiometry of existing reactions.
+    Parameters
+    ----------
+    model
+        Target ``cobra.Model``, mutated in place.
+    equations
+        Mapping of ``reaction_id -> equation string``. Every ID must already
+        exist in the model. Equation syntax is identical to
+        :func:`~raven_python.manipulation.add.add_reactions_from_equations`.
+    mets_by, compartment, allow_new_mets, new_met_prefix
+        Metabolite-matching options, as in ``add_reactions_from_equations``.
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions changed, in input order.
+
+    Notes
+    -----
+    Bounds are **not** modified, matching RAVEN. Changing an equation from
+    ``-->`` to ``<=>`` does not by itself make the reaction reversible; adjust
+    the bounds separately.
+    """
+    if mets_by not in ("id", "name"):
+        raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")
+
+    changed: list[Reaction] = []
+    for rxn_id, equation in equations.items():
+        if rxn_id not in model.reactions:
+            raise ValueError(f"Reaction {rxn_id!r} not found in the model.")
+        rxn = model.reactions.get_by_id(rxn_id)
+
+        coeffs, _reversible = _stoichiometry(
+            model,
+            equation,
+            mets_by=mets_by,
+            compartment=compartment,
+            allow_new_mets=allow_new_mets,
+            new_met_prefix=new_met_prefix,
+        )
+
+        rxn.subtract_metabolites(dict(rxn.metabolites), combine=True)
+        rxn.add_metabolites(coeffs)
+        changed.append(rxn)
+
+    return changed
+
+
+def change_gene_reaction_rules(
+    model: cobra.Model,
+    rules: Mapping[str, str],
+    *,
+    replace: bool = True,
+) -> list[Reaction]:
+    """Set or append gene-reaction rules on existing reactions.
+    cobra already does the heavy lifting on assignment to
+    ``reaction.gene_reaction_rule``: it auto-creates any new ``Gene`` objects and
+    normalises the rule. So the value here is batching plus RAVEN's ``replace``
+    option to **append** rather than overwrite.
+
+    Parameters
+    ----------
+    model
+        Target ``cobra.Model``, mutated in place.
+    rules
+        Mapping of ``reaction_id -> GPR string``. Every ID must already exist.
+    replace
+        If True (default), overwrite the existing GPR. If False, append the new
+        rule as an isozyme: ``(old) or (new)`` (just ``new`` if the reaction had
+        no GPR).
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions changed, in input order.
+    """
+    changed: list[Reaction] = []
+    for rxn_id, rule in rules.items():
+        if rxn_id not in model.reactions:
+            raise ValueError(f"Reaction {rxn_id!r} not found in the model.")
+        rxn = model.reactions.get_by_id(rxn_id)
+
+        if replace or not rxn.gene_reaction_rule:
+            new_rule = rule
+        else:
+            new_rule = f"({rxn.gene_reaction_rule}) or ({rule})"
+
+        rxn.gene_reaction_rule = new_rule  # cobra creates genes + normalises
+        changed.append(rxn)
+
+    return changed
diff --git a/src/raven_python/manipulation/compartments.py b/src/raven_python/manipulation/compartments.py
new file mode 100644
index 0000000..091d196
--- /dev/null
+++ b/src/raven_python/manipulation/compartments.py
@@ -0,0 +1,196 @@
+"""Compartment manipulation — merge all compartments into one, or copy reactions to a
+new compartment (ports of RAVEN's ``mergeCompartments`` and ``copyToComps``).
+
+Both functions are useful **independently of** :func:`raven_python.localization.predict_localization`:
+``merge_compartments`` flattens a multi-compartment model for a simplified analysis
+(e.g. checking whether the network can in principle make a metabolite, with no
+compartment topology in the way); ``copy_to_compartment`` is a building block for
+constructing dual-localised pathways. cobra has no equivalents.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+
+# Compartments produced by merge_compartments (RAVEN uses 's' for "system").
+_MERGED_COMPARTMENT = "s"
+
+
+def merge_compartments(
+    model: cobra.Model,
+    *,
+    merged_id: str = _MERGED_COMPARTMENT,
+    merged_name: str = "system",
+    drop_single_metabolite_reactions: bool = True,
+    deduplicate_reactions: bool = True,
+) -> tuple[cobra.Model, list[str], list[str]]:
+    """Merge every metabolite of ``model`` into one ``merged_id`` compartment.
+
+    Returns ``(model_copy, deleted_single_met_reactions, deduplicated_reactions)``. The
+    returned model is a deep copy of the input. Use cases:
+
+    * Check whether the network can produce/consume a metabolite at all (compartment
+      topology is often what makes a model look blocked).
+    * Simplify a model for visualisation or an analysis that doesn't care about
+      compartments.
+    * As a pre-step for localisation when the user does want RAVEN's
+      "start from scratch" workflow (call :func:`merge_compartments` then
+      :func:`raven_python.localization.predict_localization` with the full reaction list).
+
+    Metabolites that already share a base id (e.g. ``glc__D_c`` and ``glc__D_e`` both
+    map to ``glc__D``) collapse into one entity in the merged compartment; their
+    stoichiometric contributions are summed per reaction. Reactions that end up with
+    only one metabolite (e.g. ``A[c] → A[m]`` becomes ``A → A`` = nothing) are deleted
+    by default (RAVEN's ``deleteRxnsWithOneMet``). Reactions that become identical
+    after merging are deduplicated (one survives).
+    """
+    out = model.copy()
+
+    # 1. For each metabolite, derive a base id (strip the trailing _<compartment>).
+    #    Two mets in different compartments sharing the base id collapse to one.
+    new_to_old: dict[str, list[cobra.Metabolite]] = {}
+    for m in list(out.metabolites):
+        base = _base_id(m)
+        new_to_old.setdefault(base, []).append(m)
+
+    # 2. Build the merged metabolites and rewrite reactions.
+    canonical: dict[str, cobra.Metabolite] = {}
+    for base, mets in new_to_old.items():
+        proto = mets[0]
+        new_met = cobra.Metabolite(base, name=proto.name, compartment=merged_id,
+                                    formula=proto.formula, charge=proto.charge)
+        new_met.notes = dict(proto.notes or {})
+        canonical[base] = new_met
+
+    # Rewrite all reactions: replace each metabolite with its canonical, summing
+    # coefficients where multiple original mets collapse to one.
+    rewritten: dict[str, dict[str, float]] = {}
+    for r in list(out.reactions):
+        new_stoich: dict[cobra.Metabolite, float] = {}
+        for m, coeff in list(r.metabolites.items()):
+            canon = canonical[_base_id(m)]
+            new_stoich[canon] = new_stoich.get(canon, 0.0) + coeff
+        # Drop zero net coefficients (substrate + product of the same base met cancel).
+        new_stoich = {m: c for m, c in new_stoich.items() if c != 0.0}
+        rewritten[r.id] = {m.id: c for m, c in new_stoich.items()}
+
+    # Now build a fresh model with the canonical mets + rewritten reactions; the
+    # cobra in-place rewrite would require careful constraint surgery, so a clean
+    # rebuild is simpler and less error-prone.
+    merged = cobra.Model(out.id or "merged")
+    merged.compartments = {merged_id: merged_name}
+    merged.add_metabolites(list(canonical.values()))
+    deleted_single: list[str] = []
+    deduplicated: list[str] = []
+    seen_signatures: dict[tuple, str] = {}
+    keep_reactions: list[cobra.Reaction] = []
+    for r in out.reactions:
+        stoich = rewritten[r.id]
+        if drop_single_metabolite_reactions and len(stoich) <= 1:
+            deleted_single.append(r.id)
+            continue
+        if not stoich:  # everything cancelled
+            deleted_single.append(r.id)
+            continue
+        sig = (frozenset(stoich.items()), bool(r.lower_bound < 0), bool(r.upper_bound > 0))
+        if deduplicate_reactions and sig in seen_signatures:
+            deduplicated.append(r.id)
+            continue
+        seen_signatures[sig] = r.id
+        new_r = cobra.Reaction(r.id, name=r.name, lower_bound=r.lower_bound,
+                                upper_bound=r.upper_bound)
+        new_r.add_metabolites({merged.metabolites.get_by_id(mid): c for mid, c in stoich.items()})
+        new_r.gene_reaction_rule = r.gene_reaction_rule
+        if r.subsystem:
+            new_r.subsystem = r.subsystem
+        new_r.notes = dict(r.notes or {})
+        keep_reactions.append(new_r)
+    merged.add_reactions(keep_reactions)
+    return merged, deleted_single, deduplicated
+
+
+def copy_to_compartment(
+    model: cobra.Model,
+    reactions: Iterable[str],
+    target_compartment: str,
+    *,
+    target_compartment_name: str | None = None,
+    delete_original: bool = False,
+    id_suffix: str | None = None,
+) -> tuple[cobra.Model, list[str], list[str]]:
+    """Copy a set of reactions into ``target_compartment``. RAVEN's ``copyToComps``.
+
+    Returns ``(model_copy, new_reaction_ids, new_metabolite_ids)``. Use cases:
+
+    * Build a dual-localised pathway (e.g. duplicate glycolysis into a peroxisome).
+    * Mirror a curated subsystem into an additional compartment as a draft to refine.
+    * Set up the input for a flux comparison between alternate compartmentalisations.
+
+    Each copied reaction is given the id ``"<orig_id>_<id_suffix>"`` (default
+    ``id_suffix=target_compartment``); each metabolite it touches is mapped to (or
+    created in) ``target_compartment`` with the same suffix convention. ``delete_original=True``
+    moves the reactions instead of copying.
+    """
+    out = model.copy()
+    suffix = id_suffix if id_suffix is not None else target_compartment
+    if target_compartment not in out.compartments:
+        out.compartments = {**out.compartments,
+                             target_compartment: target_compartment_name or target_compartment}
+
+    preexisting_met_ids = {x.id for x in out.metabolites}
+    new_rxn_ids: list[str] = []
+    for rid in list(reactions):
+        if rid not in out.reactions:
+            raise ValueError(f"reaction {rid!r} not in model")
+        src = out.reactions.get_by_id(rid)
+        new_id = f"{rid}_{suffix}"
+        if new_id in out.reactions:
+            continue  # already copied; idempotent
+        new_stoich: dict[cobra.Metabolite, float] = {}
+        for m, coeff in src.metabolites.items():
+            target_met = _met_in_compartment(out, m, target_compartment, suffix=suffix)
+            new_stoich[target_met] = coeff
+        new_r = cobra.Reaction(new_id, name=src.name,
+                                lower_bound=src.lower_bound, upper_bound=src.upper_bound)
+        new_r.add_metabolites(new_stoich)
+        new_r.gene_reaction_rule = src.gene_reaction_rule
+        if src.subsystem:
+            new_r.subsystem = src.subsystem
+        new_r.notes = dict(src.notes or {})
+        out.add_reactions([new_r])
+        new_rxn_ids.append(new_id)
+        if delete_original:
+            out.remove_reactions([src.id], remove_orphans=False)
+
+    new_met_ids = [m.id for m in out.metabolites if m.id not in preexisting_met_ids]
+    return out, new_rxn_ids, new_met_ids
+
+
+# ----------------------------------------------------------------- helpers
+
+def _base_id(m: cobra.Metabolite) -> str:
+    """Strip the trailing ``_<compartment>`` suffix from a metabolite id (if present)."""
+    if m.compartment and m.id.endswith(f"_{m.compartment}"):
+        return m.id[: -(len(m.compartment) + 1)]
+    return m.id
+
+
+def _met_in_compartment(model: cobra.Model, source: cobra.Metabolite,
+                        compartment: str, *, suffix: str | None = None) -> cobra.Metabolite:
+    """Return (creating if needed) the copy of ``source`` in ``compartment``.
+
+    The new metabolite id is ``"<base>_<suffix>"`` (default ``suffix=compartment``).
+    Already-existing copies are reused.
+    """
+    if source.compartment == compartment:
+        return source
+    base = _base_id(source)
+    new_id = f"{base}_{suffix if suffix is not None else compartment}"
+    if new_id in model.metabolites:
+        return model.metabolites.get_by_id(new_id)
+    new_met = cobra.Metabolite(new_id, name=source.name, compartment=compartment,
+                                formula=source.formula, charge=source.charge)
+    new_met.notes = dict(source.notes or {})
+    model.add_metabolites([new_met])
+    return new_met
diff --git a/src/raven_python/manipulation/expand.py b/src/raven_python/manipulation/expand.py
new file mode 100644
index 0000000..246f3b9
--- /dev/null
+++ b/src/raven_python/manipulation/expand.py
@@ -0,0 +1,124 @@
+"""Expand reactions with isozymes into one reaction per isozyme.
+
+Operates on cobra's GPR AST, so the model stays a plain ``cobra.Model`` throughout.
+
+Provenance: this implementation was first written for geckopy
+(``geckopy/ec_model/pipeline/expand.py``, where it backed makeEcModel stage 5)
+and is adopted here as its canonical home; geckopy will import it from raven_python
+once raven_python is published.
+
+MATLAB-COMPAT: GECKO MATLAB and RAVEN ``expandModel.m`` use string manipulation
+on grRules to detect and split isozymes. raven_python uses cobrapy's GPR AST
+instead. Output should be equivalent for any well-formed GPR; cases that differ
+are likely malformed GPR strings that the AST flags as invalid.
+"""
+from __future__ import annotations
+
+import ast
+import copy
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _gpr_to_dnf(gpr: GPR) -> list[list[str]]:
+    """Convert a GPR to disjunctive normal form (list of AND-clauses).
+
+    An empty GPR yields an empty list. A single clause (no OR anywhere)
+    yields a list of length 1. OR-of-ANDs yields one sublist per
+    disjunct, each containing the gene names ANDed together.
+
+    Handles distributivity: ``g1 and (g2 or g3)`` becomes
+    ``[[g1, g2], [g1, g3]]``.
+    """
+    if gpr is None or gpr.body is None:
+        return []
+    return _node_to_dnf(gpr.body)
+
+
+def _node_to_dnf(node) -> list[list[str]]:
+    """Recursive helper. Returns DNF as list of AND-clauses."""
+    if isinstance(node, ast.Name):
+        return [[node.id]]
+    if isinstance(node, ast.BoolOp):
+        if isinstance(node.op, ast.Or):
+            result: list[list[str]] = []
+            for child in node.values:
+                result.extend(_node_to_dnf(child))
+            return result
+        if isinstance(node.op, ast.And):
+            clauses: list[list[str]] = [[]]
+            for child in node.values:
+                child_dnf = _node_to_dnf(child)
+                new_clauses: list[list[str]] = []
+                for existing in clauses:
+                    for extra in child_dnf:
+                        new_clauses.append(existing + extra)
+                clauses = new_clauses
+            return clauses
+    raise ValueError(f"Unexpected GPR node type: {type(node).__name__}")
+
+
+def expand_model(model: cobra.Model) -> list[str]:
+    """Split reactions with isozymes (OR in GPR) into one reaction per isozyme.
+    For each reaction whose GPR contains at least one OR, the reaction
+    is removed and replaced by one copy per disjunctive clause. The new
+    reactions get ID suffix ``_EXP_1``, ``_EXP_2``, etc. All other
+    fields (stoichiometry, bounds, name, subsystem) are copied verbatim;
+    only the GPR is simplified to the single AND-clause for that
+    isozyme.
+
+    Reactions with no GPR, or with a GPR that has no OR, are left
+    untouched.
+
+    Parameters
+    ----------
+    model
+        A cobra.Model, mutated in place.
+
+    Returns
+    -------
+    list of str
+        Sorted IDs of newly added expanded reactions (those with
+        ``_EXP_N`` suffixes). The original reactions that were split
+        are no longer in the model.
+    """
+    expansions: list[tuple[cobra.Reaction, list[list[str]]]] = []
+
+    for rxn in model.reactions:
+        if not rxn.gene_reaction_rule:
+            continue
+        clauses = _gpr_to_dnf(rxn.gpr)
+        if len(clauses) <= 1:
+            continue
+        expansions.append((rxn, clauses))
+
+    added_ids: list[str] = []
+    for original_rxn, clauses in expansions:
+        new_rxns: list[cobra.Reaction] = []
+        for i, clause in enumerate(clauses, start=1):
+            new_rxn = cobra.Reaction(
+                id=f"{original_rxn.id}_EXP_{i}",
+                name=original_rxn.name,
+            )
+            new_rxn.lower_bound = original_rxn.lower_bound
+            new_rxn.upper_bound = original_rxn.upper_bound
+            new_rxn.add_metabolites(dict(original_rxn.metabolites.items()))
+            new_rxn.subsystem = original_rxn.subsystem
+            new_rxn.gene_reaction_rule = " and ".join(clause)
+            # Propagate per-reaction metadata (notably ec-code / annotations)
+            # so downstream functions see the same annotations on expanded
+            # reactions as on the original. Deep-copy so siblings are independent.
+            new_rxn.annotation = copy.deepcopy(original_rxn.annotation)
+            new_rxn.notes = copy.deepcopy(original_rxn.notes)
+            new_rxns.append(new_rxn)
+
+        obj_coeff = original_rxn.objective_coefficient
+        model.remove_reactions([original_rxn])
+        model.add_reactions(new_rxns)
+        if obj_coeff:  # keep the original in the objective — sum over its isozyme copies
+            for new_rxn in new_rxns:
+                new_rxn.objective_coefficient = obj_coeff
+        added_ids.extend(r.id for r in new_rxns)
+
+    return sorted(added_ids)
diff --git a/src/raven_python/manipulation/irreversible.py b/src/raven_python/manipulation/irreversible.py
new file mode 100644
index 0000000..3f64a68
--- /dev/null
+++ b/src/raven_python/manipulation/irreversible.py
@@ -0,0 +1,72 @@
+"""Convert reversible reactions to an irreversible (forward + reverse) form.
+
+cobrapy's own ``convert_to_irreversible`` was removed, so this is a genuine
+implementation rather than a wrapper.
+
+Provenance: first written for geckopy
+(``geckopy/ec_model/pipeline/preprocess.py``, makeEcModel stage 4, tagged
+"RAVENpy candidate") and adopted here as its canonical home; geckopy will
+import it from raven_python once raven_python is published.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def convert_to_irreversible(model: cobra.Model) -> list[str]:
+    """Split non-exchange reversible reactions into a forward + reverse pair.
+    For each non-exchange reaction with ``lb < 0``:
+
+    - The original reaction is kept as the forward direction. Its
+      lower bound is clamped to 0.
+    - A new reaction with the same ID plus a ``_REV`` suffix is added,
+      representing the reverse direction. Its stoichiometry is the
+      negation of the original, its bounds are ``(0, -original_lb)``,
+      and it inherits the name (with " (reversible)" appended) and the
+      gene-protein rule of the original.
+
+    Exchange reactions (boundary reactions) are never split, regardless
+    of their bounds, matching MATLAB behavior where exchange reactions
+    are explicitly excluded from ``convertToIrrev``.
+
+    Parameters
+    ----------
+    model
+        A cobra.Model, mutated in place.
+
+    Returns
+    -------
+    list of str
+        Sorted IDs of newly added reverse reactions (the ones ending in
+        ``_REV``). The forward reactions retain their original IDs.
+    """
+    reverse_rxns_to_add: list[cobra.Reaction] = []
+    forward_updates: list[cobra.Reaction] = []
+
+    for rxn in model.reactions:
+        if rxn.boundary:
+            continue
+        if rxn.lower_bound >= 0:
+            continue
+
+        original_lb = rxn.lower_bound
+
+        rev_rxn = cobra.Reaction(
+            id=f"{rxn.id}_REV",
+            name=(f"{rxn.name} (reversible)" if rxn.name else f"{rxn.id}_REV"),
+        )
+        rev_rxn.lower_bound = 0.0
+        rev_rxn.upper_bound = -original_lb
+        rev_rxn.add_metabolites({m: -c for m, c in rxn.metabolites.items()})
+        rev_rxn.gene_reaction_rule = rxn.gene_reaction_rule
+
+        reverse_rxns_to_add.append(rev_rxn)
+        forward_updates.append(rxn)
+
+    for rxn in forward_updates:
+        rxn.lower_bound = 0.0
+
+    if reverse_rxns_to_add:
+        model.add_reactions(reverse_rxns_to_add)
+
+    return sorted(r.id for r in reverse_rxns_to_add)
diff --git a/src/raven_python/manipulation/merge.py b/src/raven_python/manipulation/merge.py
new file mode 100644
index 0000000..bfa1f24
--- /dev/null
+++ b/src/raven_python/manipulation/merge.py
@@ -0,0 +1,146 @@
+"""Merge several models into one.
+
+cobra's ``Model.merge`` is pairwise and matches everything strictly by id; this
+merges **N** models and unifies metabolites by **name[compartment]** (so the same
+compound under different ids in two models becomes one), while adding **all**
+reactions without de-duplication
+(a reaction whose ID already exists is renamed ``id_<sourceid>``). Genes are
+unified by ID. Provenance (which source model each object came from) is recorded
+in ``notes['origin']``.
+
+The bulk of RAVEN's function is struct field-padding and manual S-matrix
+assembly, none of which is needed on ``cobra.Model``.
+"""
+from __future__ import annotations
+
+import copy
+import warnings
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Model, Reaction
+
+
+def _unique_id(existing, base: str, suffix: str) -> str:
+    """Return base, or base_suffix (then base_suffix_2, ...) if it collides."""
+    if base not in existing:
+        return base
+    candidate = f"{base}_{suffix}"
+    n = 2
+    while candidate in existing:
+        candidate = f"{base}_{suffix}_{n}"
+        n += 1
+    return candidate
+
+
+def merge_models(
+    models: Iterable[cobra.Model],
+    *,
+    match_by: str = "name",
+    track_origin: bool = True,
+) -> cobra.Model:
+    """Merge models into a single new model.
+    Parameters
+    ----------
+    models
+        The models to merge (two or more). A single model is returned as a copy.
+    match_by
+        How metabolites are unified across models: ``"name"`` (default) treats
+        metabolites with the same *name and compartment* as identical (IDs
+        ignored); ``"id"`` matches by metabolite ID.
+    track_origin
+        If True (default), record the source model's ``id`` in each reaction's,
+        metabolite's, and gene's ``notes['origin']``.
+
+    Returns
+    -------
+    cobra.Model
+        A new merged model (``id="MERGED"``). Reactions are **not** de-duplicated
+        — matching RAVEN, every reaction from every model is kept, with ID
+        collisions renamed ``id_<sourceid>``.
+    """
+    models = list(models)
+    if not models:
+        raise ValueError("merge_models requires at least one model.")
+    if match_by not in ("name", "id"):
+        raise ValueError(f"match_by must be 'name' or 'id', got {match_by!r}")
+    if len(models) == 1:
+        return models[0].copy()
+
+    merged = Model("MERGED")
+    comp_names: dict[str, str] = {}
+    met_lookup: dict = {}  # name/comp or id key -> merged Metabolite
+
+    def met_key(met: Metabolite):
+        return (met.name, met.compartment) if match_by == "name" else met.id
+
+    def ensure_metabolite(src: Metabolite, origin: str) -> Metabolite:
+        key = met_key(src)
+        if key in met_lookup:
+            existing = met_lookup[key]
+            # Two source models can map to the same name[comp] (or id) with
+            # different formula/charge; silently picking the first-seen has
+            # quietly corrupted mass balance in the past. Warn so the caller
+            # sees the conflict.
+            if src.formula and existing.formula and src.formula != existing.formula:
+                warnings.warn(
+                    f"merge_models: metabolite {existing.id!r} (from earlier model) "
+                    f"and {src.id!r} (from {origin!r}) share key {key!r} but "
+                    f"have different formulas ({existing.formula!r} vs {src.formula!r}); "
+                    "keeping the first.",
+                    stacklevel=3,
+                )
+            if (
+                existing.charge is not None
+                and src.charge is not None
+                and existing.charge != src.charge
+            ):
+                warnings.warn(
+                    f"merge_models: metabolite {existing.id!r} (from earlier model) "
+                    f"and {src.id!r} (from {origin!r}) share key {key!r} but "
+                    f"have different charges ({existing.charge} vs {src.charge}); "
+                    "keeping the first.",
+                    stacklevel=3,
+                )
+            return existing
+        new_id = _unique_id(merged.metabolites, src.id, origin)
+        new_met = Metabolite(
+            new_id, name=src.name, compartment=src.compartment,
+            formula=src.formula, charge=src.charge,
+        )
+        new_met.annotation = copy.deepcopy(src.annotation)
+        new_met.notes = copy.deepcopy(src.notes)
+        if track_origin:
+            new_met.notes.setdefault("origin", origin)
+        merged.add_metabolites([new_met])
+        met_lookup[key] = new_met
+        return new_met
+
+    for model in models:
+        origin = model.id or "model"
+        comp_names.update(model.compartments)
+        genes_before = {g.id for g in merged.genes}
+
+        for rxn in model.reactions:
+            new_id = _unique_id(merged.reactions, rxn.id, origin)
+            new_rxn = Reaction(new_id, name=rxn.name)
+            new_rxn.bounds = rxn.bounds
+            new_rxn.subsystem = rxn.subsystem
+            merged.add_reactions([new_rxn])
+            new_rxn.add_metabolites(
+                {ensure_metabolite(m, origin): coef for m, coef in rxn.metabolites.items()}
+            )
+            if rxn.gene_reaction_rule:
+                new_rxn.gene_reaction_rule = rxn.gene_reaction_rule
+            new_rxn.annotation = copy.deepcopy(rxn.annotation)
+            new_rxn.notes = copy.deepcopy(rxn.notes)
+            if track_origin:
+                new_rxn.notes.setdefault("origin", origin)
+
+        if track_origin:
+            for gene in merged.genes:
+                if gene.id not in genes_before:
+                    gene.notes.setdefault("origin", origin)
+
+    merged._compartments.update(comp_names)
+    return merged
diff --git a/src/raven_python/manipulation/parameters.py b/src/raven_python/manipulation/parameters.py
new file mode 100644
index 0000000..f349804
--- /dev/null
+++ b/src/raven_python/manipulation/parameters.py
@@ -0,0 +1,78 @@
+"""Set reaction bounds to a sign-aware ±% variance band around measured values.
+
+Cobra has no idiom for the *variance band* case (e.g. "5 ± 20 %"); the other common
+bound-setting cases are cobra one-liners:
+
+* fixed lb / ub  → ``reaction.lower_bound`` / ``upper_bound`` / ``reaction.bounds``
+* equality       → ``reaction.bounds = (v, v)``
+* objective      → ``model.objective = {reaction: coeff}``
+* unconstrained  → ``reaction.bounds = cobra.Configuration().bounds``
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Sequence
+
+import cobra
+from cobra import Reaction
+
+Number = int | float
+
+
+def _resolve(model: cobra.Model, reactions) -> list[Reaction]:
+    if isinstance(reactions, (str, Reaction)):
+        reactions = [reactions]
+    out: list[Reaction] = []
+    for r in reactions:
+        if isinstance(r, Reaction):
+            out.append(r)
+        elif r in model.reactions:
+            out.append(model.reactions.get_by_id(r))
+        else:
+            raise ValueError(f"Reaction {r!r} not found in the model.")
+    return out
+
+
+def _broadcast(value, n: int) -> list[float]:
+    if isinstance(value, (int, float)):
+        return [float(value)] * n
+    vals = [float(v) for v in value]
+    if len(vals) != n:
+        raise ValueError(
+            f"Expected 1 or {n} values to match the reactions, got {len(vals)}."
+        )
+    return vals
+
+
+def set_variance_bounds(
+    model: cobra.Model,
+    reactions: str | Reaction | Iterable,
+    values: Number | Sequence[Number],
+    percent: Number,
+) -> list[Reaction]:
+    """Constrain reactions to a ``±percent/2`` band around measured values.
+
+    For a measured value ``v`` and ``percent`` ``p``, the bounds become
+    ``v * (1 - p/200) .. v * (1 + p/200)`` — i.e. ``percent`` is the *total*
+    width, split half above and half below. For a negative ``v`` the two are
+    swapped so that ``lb <= ub``. E.g. ``percent=5`` gives 97.5 %..102.5 % of ``v``.
+
+    Parameters
+    ----------
+    reactions
+        Reaction IDs or objects.
+    values
+        Measured value per reaction; a scalar is broadcast to all reactions.
+    percent
+        Total band width as a percentage.
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions affected.
+    """
+    rxns = _resolve(model, reactions)
+    half = percent / 200.0
+    for rxn, v in zip(rxns, _broadcast(values, len(rxns)), strict=True):
+        lo, hi = v * (1 - half), v * (1 + half)
+        rxn.bounds = (hi, lo) if v < 0 else (lo, hi)
+    return rxns
diff --git a/src/raven_python/manipulation/remove.py b/src/raven_python/manipulation/remove.py
new file mode 100644
index 0000000..492de36
--- /dev/null
+++ b/src/raven_python/manipulation/remove.py
@@ -0,0 +1,120 @@
+"""Remove metabolites or genes from a model.
+
+For removing *reactions*, use cobra directly:
+``cobra.Model.remove_reactions(reactions, remove_orphans=...)``.
+
+The two functions here delegate the core to cobra and add the cobra-absent behaviour:
+
+* ``remove_metabolites`` — cobra matches metabolites by ID; RAVEN's ``isNames``
+  deletes a metabolite in **every compartment at once** by name. That name
+  resolution is the *sole* reason this wrapper exists (see the note on it).
+* ``remove_genes`` — cobra's ``cobra.manipulation.remove_genes`` already rewrites
+  GPRs through the boolean AST (removing one gene of ``A and B`` empties the
+  rule, of ``A or B`` keeps the other) — exactly RAVEN's intent, without its
+  ``eval``. The gap is RAVEN's default of **constraining** flux-blocked reactions
+  to zero instead of deleting them; exposed as ``blocked_reactions``.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+from cobra import Gene, Metabolite
+from cobra.manipulation import remove_genes as _cobra_remove_genes
+
+
+def _as_list(obj) -> list:
+    if isinstance(obj, (str, Metabolite, Gene)):
+        return [obj]
+    return list(obj)
+
+
+def remove_metabolites(
+    model: cobra.Model,
+    metabolites: str | Metabolite | Iterable,
+    *,
+    by_name: bool = False,
+    destructive: bool = False,
+) -> None:
+    """Remove metabolites, optionally matching by name across all compartments.
+
+    Parameters
+    ----------
+    by_name
+        If True, ``metabolites`` are metabolite *names*; every metabolite with a
+        matching name is removed, regardless of compartment (RAVEN ``isNames``).
+        If False, they are IDs/objects, resolved via cobra.
+    destructive
+        Passed to cobra: if True, also remove every reaction the metabolite
+        participates in.
+
+    Note
+    ----
+    With ``by_name=False`` this is just ``model.remove_metabolites`` — the
+    ``by_name`` cross-compartment deletion is the only thing this adds over cobra.
+    """
+    if by_name:
+        wanted = set(_as_list(metabolites))
+        targets = [m for m in model.metabolites if m.name in wanted]
+    else:
+        targets = model.metabolites.get_by_any(_as_list(metabolites))
+    if targets:
+        model.remove_metabolites(targets, destructive=destructive)
+
+
+def remove_genes(
+    model: cobra.Model,
+    genes: str | Gene | Iterable,
+    *,
+    blocked_reactions: str = "remove",
+    remove_orphans: bool = False,
+) -> list[str]:
+    """Remove genes and handle reactions left unable to carry flux.
+
+    GPR rewriting (with correct AND/OR semantics) and gene deletion are done by cobra;
+    this adds a policy for reactions whose GPR becomes empty (no enzyme left):
+
+    * ``"remove"`` — delete them (cobra's default).
+    * ``"constrain"`` — keep them but set bounds to ``(0, 0)``.
+    * ``"keep"`` — leave them with an empty GPR and unchanged bounds.
+
+    ``remove_orphans`` (only meaningful with ``blocked_reactions="remove"``)
+    passes through to cobra: drop metabolites *and* genes orphaned by the removal.
+
+    Returns
+    -------
+    list of str
+        IDs of the reactions that became flux-blocked (had a GPR, now empty).
+    """
+    if blocked_reactions not in ("remove", "constrain", "keep"):
+        raise ValueError(
+            f"blocked_reactions must be 'remove', 'constrain', or 'keep', "
+            f"got {blocked_reactions!r}"
+        )
+
+    # Resolve to gene IDs that are actually in the model (RAVEN filters likewise).
+    requested = [g.id if isinstance(g, Gene) else g for g in _as_list(genes)]
+    present = [gid for gid in requested if gid in model.genes]
+    if not present:
+        return []
+
+    # Reactions touched by these genes that currently have a GPR.
+    affected = set()
+    for gid in present:
+        affected.update(r.id for r in model.genes.get_by_id(gid).reactions)
+    had_gpr = {rid for rid in affected if model.reactions.get_by_id(rid).gene_reaction_rule}
+
+    # cobra rewrites GPRs (AST) and removes the gene objects; we manage reactions.
+    _cobra_remove_genes(model, present, remove_reactions=False)
+
+    blocked = [
+        rid for rid in had_gpr if not model.reactions.get_by_id(rid).gene_reaction_rule
+    ]
+
+    if blocked_reactions == "remove":
+        model.remove_reactions(blocked, remove_orphans=remove_orphans)
+    elif blocked_reactions == "constrain":
+        for rid in blocked:
+            model.reactions.get_by_id(rid).bounds = (0, 0)
+
+    return sorted(blocked)
diff --git a/src/raven_python/manipulation/simplify.py b/src/raven_python/manipulation/simplify.py
new file mode 100644
index 0000000..2deaccd
--- /dev/null
+++ b/src/raven_python/manipulation/simplify.py
@@ -0,0 +1,229 @@
+"""Reduce a model by removing/merging reactions that cannot carry flux.
+
+Four reduction modes that cobra does not cover out of the box:
+``remove_dead_end_reactions`` (reactions whose substrates have no producer),
+``remove_duplicate_reactions``, ``constrain_reversible_reactions`` (tighten bounds
+via FVA), and ``group_linear_reactions`` (lossy fold of unit-stoichiometry chains
+into one reaction; drops gene rules).
+
+Cobra-covered modes that you'd reach for separately:
+
+* No-flux removal → ``cobra.flux_analysis.find_blocked_reactions``.
+* Zero-interval removal → filter reactions with ``bounds == (0, 0)`` then prune.
+"""
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable
+
+import cobra
+from cobra.flux_analysis import flux_variability_analysis
+
+from raven_python.manipulation.irreversible import convert_to_irreversible
+
+
+def _prune_orphan_metabolites(model: cobra.Model) -> list[str]:
+    orphans = [m for m in model.metabolites if not m.reactions]
+    if orphans:
+        model.remove_metabolites(orphans)
+    return [m.id for m in orphans]
+
+
+def _can_produce_and_consume(met) -> tuple[bool, bool]:
+    """Whether the network can both produce and consume ``met`` (given directions)."""
+    produce = consume = False
+    for rxn in met.reactions:
+        coef = rxn.get_coefficient(met)
+        if coef > 0:
+            produce |= rxn.upper_bound > 0
+            consume |= rxn.lower_bound < 0
+        elif coef < 0:
+            consume |= rxn.upper_bound > 0
+            produce |= rxn.lower_bound < 0
+    return produce, consume
+
+
+def remove_dead_end_reactions(
+    model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> tuple[list[str], list[str]]:
+    """Iteratively remove dead-end reactions and metabolites.
+
+    A metabolite
+    is a dead end if it participates in only one reaction, or if (accounting for
+    reaction directionality) it can only be produced or only consumed — such
+    metabolites cannot carry steady-state flux, so the reactions touching them
+    are removed. Repeats until stable.
+
+    Returns ``(removed_reaction_ids, removed_metabolite_ids)``.
+    """
+    reserved = set(reserved or [])
+    removed_rxns: list[str] = []
+    removed_mets: list[str] = []
+    while True:
+        removed_mets += _prune_orphan_metabolites(model)
+        dead = [
+            m
+            for m in model.metabolites
+            if len(m.reactions) <= 1 or not all(_can_produce_and_consume(m))
+        ]
+        if not dead:
+            break
+        rxns = {r for m in dead for r in m.reactions}
+        to_delete = [r for r in rxns if r.id not in reserved]
+        if not to_delete:
+            break
+        removed_rxns += [r.id for r in to_delete]
+        model.remove_reactions(to_delete)
+    return removed_rxns, removed_mets
+
+
+def _signature(rxn):
+    mets = frozenset((m.id, c) for m, c in rxn.metabolites.items())
+    return (mets, rxn.lower_bound, rxn.upper_bound, rxn.objective_coefficient)
+
+
+def remove_duplicate_reactions(
+    model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> list[str]:
+    """Remove all-but-one of each set of duplicate reactions.
+
+    Reactions are duplicates when they have identical stoichiometry, bounds, and
+    objective coefficient. One of each set is kept (reserved reactions are never
+    removed). Returns the removed reaction IDs.
+    """
+    reserved = set(reserved or [])
+    groups: dict = {}
+    for rxn in model.reactions:
+        groups.setdefault(_signature(rxn), []).append(rxn)
+
+    removed: list[str] = []
+    for rxns in groups.values():
+        if len(rxns) <= 1:
+            continue
+        keep = rxns[-1]
+        to_remove = [r for r in rxns if r is not keep and r.id not in reserved]
+        if to_remove:
+            removed += [r.id for r in to_remove]
+            model.remove_reactions(to_remove)
+    return removed
+
+
+def constrain_reversible_reactions(
+    model: cobra.Model, *, eps: float = 1e-9
+) -> list[str]:
+    """Constrain reversible reactions that can only carry flux one way.
+
+    Runs FVA on
+    each reversible reaction; if it can only carry forward flux its lower bound
+    is set to 0, and if it can only carry reverse flux it is flipped to a forward
+    reaction (stoichiometry, bounds, and objective negated). Returns the changed
+    reaction IDs.
+    """
+    revs = [r for r in model.reactions if r.lower_bound < 0 < r.upper_bound]
+    if not revs:
+        return []
+    # Infeasible models surface as either OptimizationError (Gurobi/HiGHS) or
+    # NaN-filled ranges (some optlang backends silently). Catch both and raise
+    # a single clear error — the original ``abs(NaN) < eps`` comparison would
+    # have silently no-op'd, letting bogus "all reactions truly reversible"
+    # decisions sneak through.
+    try:
+        fva = flux_variability_analysis(
+            model, reaction_list=revs, fraction_of_optimum=0.0
+        )
+    except Exception as exc:  # noqa: BLE001 - solver-family agnostic
+        raise RuntimeError(
+            "constrain_reversible_reactions: FVA failed — the model is likely "
+            "infeasible at fraction_of_optimum=0. Fix the infeasibility first "
+            "(often a missing exchange or an over-constrained essential). "
+            f"({exc})"
+        ) from exc
+    if fva[["minimum", "maximum"]].isna().any().any():
+        raise RuntimeError(
+            "constrain_reversible_reactions: FVA returned NaN ranges — the "
+            "model is infeasible at fraction_of_optimum=0. Fix the infeasibility "
+            "first (often a missing exchange or an over-constrained essential)."
+        )
+
+    changed: list[str] = []
+    for rxn in revs:
+        lo = fva.at[rxn.id, "minimum"]
+        hi = fva.at[rxn.id, "maximum"]
+        # Guard against ±inf ranges (unbounded objective): treat them as truly
+        # reversible rather than "zero" by the abs(·) < eps check.
+        if math.isinf(lo) or math.isinf(hi):
+            continue
+        min_zero, max_zero = abs(lo) < eps, abs(hi) < eps
+        if min_zero == max_zero:  # both ~0 (blocked) or both nonzero (truly reversible)
+            continue
+        if max_zero:  # only reverse flux → flip to a forward reaction
+            old_lb = rxn.lower_bound
+            rxn.add_metabolites({m: -2 * c for m, c in rxn.metabolites.items()})
+            rxn.bounds = (0.0, -old_lb)
+            rxn.objective_coefficient = -rxn.objective_coefficient
+        else:  # only forward flux
+            rxn.lower_bound = 0.0
+        changed.append(rxn.id)
+    return changed
+
+
+def group_linear_reactions(
+    model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> None:
+    """Merge linear (single-producer, single-consumer) reaction chains.
+
+    **Lossy**: gene-reaction
+    associations are discarded (RAVEN does the same), since merged reactions have
+    no meaningful combined GPR. The model is first made irreversible, then any
+    metabolite that is produced by exactly one reaction and consumed by exactly
+    one reaction is eliminated by merging the two reactions. Mutates in place.
+    """
+    reserved = set(reserved or [])
+
+    # Lossy: drop all gene information.
+    for rxn in model.reactions:
+        rxn.gene_reaction_rule = ""
+    for gene in list(model.genes):
+        model.genes.remove(gene)
+
+    convert_to_irreversible(model)
+
+    # Worklist of metabolites to (re)consider for merging. Each metabolite
+    # participating in a merge can expose new linear chains in its neighbours,
+    # so we re-enqueue the touched mets rather than restart the whole scan
+    # (the old O(n²·m) restart-after-every-merge loop).
+    pending: list = list(model.metabolites)
+    seen_in_pass: set = set()
+    while pending:
+        met = pending.pop()
+        if met not in model.metabolites:  # removed in a previous merge
+            continue
+        rxns = list(met.reactions)
+        if len(rxns) != 2 or any(r.id in reserved for r in rxns):
+            continue
+        r1, r2 = rxns
+        c1, c2 = r1.get_coefficient(met), r2.get_coefficient(met)
+        if (c1 > 0) == (c2 > 0):  # need one producer and one consumer
+            continue
+        ratio = abs(c1 / c2)
+        new_lb = max(r1.lower_bound, r2.lower_bound / ratio)
+        new_ub = min(r1.upper_bound, r2.upper_bound / ratio)
+        new_obj = r1.objective_coefficient + r2.objective_coefficient * ratio
+        # Re-enqueue every metabolite touched by either side — the merge can
+        # turn neighbours into single-producer/consumer chains in turn.
+        touched = {m for m in r1.metabolites} | {m for m in r2.metabolites}
+        # Merge r2*ratio into r1; the shared metabolite cancels and is dropped.
+        r1.add_metabolites({m: c * ratio for m, c in r2.metabolites.items()})
+        model.remove_reactions([r2])
+        r1.bounds = (new_lb, new_ub)
+        r1.objective_coefficient = new_obj
+        seen_in_pass.clear()
+        for m in touched:
+            if m in model.metabolites and id(m) not in seen_in_pass:
+                seen_in_pass.add(id(m))
+                pending.append(m)
+    # One terminal cleanup pass (cheap; only what remains).
+    empty = [r for r in model.reactions if not r.metabolites]
+    if empty:
+        model.remove_reactions(empty)
+    _prune_orphan_metabolites(model)
diff --git a/src/raven_python/manipulation/transfer.py b/src/raven_python/manipulation/transfer.py
new file mode 100644
index 0000000..b867f02
--- /dev/null
+++ b/src/raven_python/manipulation/transfer.py
@@ -0,0 +1,144 @@
+"""Copy reactions (with their metabolites and genes) from another model.
+
+cobra's ``Model.merge`` / ``add_reactions`` match metabolites strictly by id. This
+transfers a chosen set of reactions from a *source* model into a draft, matching
+metabolites by **name[compartment]** instead — so a compound present in both models
+under different ids is reused rather than duplicated, and only genuinely new
+metabolites are created (copying the source's id, formula,
+charge, and annotation). New genes are auto-created by cobra when the GPR is set.
+This is the post-``getModelFromHomology`` "copy a few more reactions across"
+workflow.
+"""
+from __future__ import annotations
+
+import copy
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Reaction
+
+from raven_python.manipulation.add import _new_met_id
+
+
+def _name_comp(met: Metabolite) -> str:
+    return f"{met.name}[{met.compartment}]"
+
+
+def add_reactions_from_model(
+    model: cobra.Model,
+    source_model: cobra.Model,
+    reactions: str | Iterable[str],
+    *,
+    genes: bool | str | Iterable[str] = False,
+    note: str | None = "Added via add_reactions_from_model()",
+    confidence: int | None = None,
+) -> list[Reaction]:
+    """Copy reactions from ``source_model`` into ``model``.
+    Parameters
+    ----------
+    model
+        Draft model to copy into (mutated in place).
+    source_model
+        Model to copy reactions from.
+    reactions
+        Reaction ID(s) in ``source_model``. Reactions already present in
+        ``model`` (by ID) are skipped.
+    genes
+        ``False`` (default): add reactions without GPRs. ``True``: copy each
+        reaction's GPR from the source. A string: use it as the GPR for every
+        added reaction. A list: per-reaction GPRs (matching the reactions that
+        are actually added). New genes are created automatically.
+    note
+        Stored in each added reaction's ``notes['note']`` (set ``None`` to skip).
+    confidence
+        If given, stored in each added reaction's ``notes['confidence_score']``.
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The reactions added, in input order.
+    """
+    rxn_ids = [reactions] if isinstance(reactions, str) else list(reactions)
+    missing = [r for r in rxn_ids if r not in source_model.reactions]
+    if missing:
+        raise ValueError(f"Reactions not found in the source model: {missing}")
+
+    new_ids = [r for r in rxn_ids if r not in model.reactions]
+    if not new_ids:
+        raise ValueError("All reactions are already in the model.")
+    source_rxns = [source_model.reactions.get_by_id(r) for r in new_ids]
+
+    if genes is False:
+        rules = [""] * len(source_rxns)
+    elif genes is True:
+        rules = [r.gene_reaction_rule for r in source_rxns]
+    elif isinstance(genes, str):
+        rules = [genes] * len(source_rxns)
+    else:
+        rules = list(genes)
+        if len(rules) != len(source_rxns):
+            raise ValueError(
+                f"genes list has {len(rules)} rules but {len(source_rxns)} "
+                "reactions are being added."
+            )
+
+    # Match metabolites by name[comp]; create only the genuinely new ones.
+    draft_by_name = {_name_comp(m): m for m in model.metabolites}
+    new_mets: list[Metabolite] = []
+    pending: set[str] = set()
+    # Track ids minted within this batch so two source mets that share an id
+    # but differ in name[comp] don't collide when add_metabolites runs.
+    pending_ids: set[str] = set()
+    for srx in source_rxns:
+        for met in srx.metabolites:
+            key = _name_comp(met)
+            if key in draft_by_name or key in pending:
+                continue
+            pending.add(key)
+            if met.id not in model.metabolites and met.id not in pending_ids:
+                new_id = met.id
+            else:
+                # _new_met_id only knows the model; loop past in-batch hits too.
+                new_id = _new_met_id(model, "m")
+                while new_id in pending_ids:
+                    n = int(new_id[1:]) + 1
+                    new_id = f"m{n}"
+                    while new_id in model.metabolites:
+                        n += 1
+                        new_id = f"m{n}"
+            pending_ids.add(new_id)
+            new_met = Metabolite(
+                new_id,
+                name=met.name,
+                compartment=met.compartment,
+                formula=met.formula,
+                charge=met.charge,
+            )
+            new_met.annotation = copy.deepcopy(met.annotation)
+            new_met.notes = copy.deepcopy(met.notes)
+            new_mets.append(new_met)
+            draft_by_name[key] = new_met
+    if new_mets:
+        model.add_metabolites(new_mets)
+
+    added: list[Reaction] = []
+    for srx, rule in zip(source_rxns, rules, strict=True):
+        rxn = Reaction(srx.id, name=srx.name)
+        rxn.bounds = srx.bounds
+        rxn.subsystem = srx.subsystem
+        model.add_reactions([rxn])
+        rxn.add_metabolites(
+            {draft_by_name[_name_comp(met)]: coef for met, coef in srx.metabolites.items()}
+        )
+        if rule:
+            rxn.gene_reaction_rule = rule
+        rxn.annotation = copy.deepcopy(srx.annotation)
+        notes = copy.deepcopy(srx.notes)
+        if note is not None:
+            notes["note"] = note
+        if confidence is not None:
+            notes["confidence_score"] = confidence
+        rxn.notes = notes
+        added.append(rxn)
+
+    return added
diff --git a/src/raven_python/manipulation/transport.py b/src/raven_python/manipulation/transport.py
new file mode 100644
index 0000000..d0c1bf1
--- /dev/null
+++ b/src/raven_python/manipulation/transport.py
@@ -0,0 +1,157 @@
+"""Add transport reactions between compartments.
+
+cobra has no transport-reaction primitive. For each metabolite this matches the
+species by *name* across compartments (the source in ``from_compartment`` and its
+same-named twin in each target compartment), optionally creating the target
+metabolite, and
+builds a ``-1 from / +1 to`` reaction with a sequential ``tr_0001`` ID.
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Reaction
+
+from raven_python.manipulation.add import _new_met_id
+
+
+def _index_by_name(mets: Iterable[Metabolite], compartment: str) -> dict[str, Metabolite]:
+    """Index metabolites by name, warning when a name is duplicated.
+
+    Same-name duplicates in a single compartment are unusual but legal in cobra,
+    and the previous one-pass dict comprehension silently dropped all but one.
+    """
+    out: dict[str, list[Metabolite]] = {}
+    for m in mets:
+        out.setdefault(m.name, []).append(m)
+    chosen: dict[str, Metabolite] = {}
+    for name, group in out.items():
+        if len(group) > 1:
+            warnings.warn(
+                f"Multiple metabolites named {name!r} in compartment {compartment!r} "
+                f"({[m.id for m in group]}); using {group[0].id!r} for transport.",
+                stacklevel=3,
+            )
+        chosen[name] = group[0]
+    return chosen
+
+
+def _transport_id_factory(model: cobra.Model, prefix: str):
+    pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+    used = [int(m.group(1)) for r in model.reactions if (m := pattern.match(r.id))]
+    counter = max(used) + 1 if used else 1
+
+    def next_id() -> str:
+        nonlocal counter
+        while f"{prefix}{counter:04d}" in model.reactions:
+            counter += 1
+        rid = f"{prefix}{counter:04d}"
+        counter += 1
+        return rid
+
+    return next_id
+
+
+def add_transport_reactions(
+    model: cobra.Model,
+    from_compartment: str,
+    to_compartments: str | Iterable[str],
+    metabolite_names: str | Iterable[str] | None = None,
+    *,
+    reversible: bool = True,
+    only_to_existing: bool = True,
+    id_prefix: str = "tr_",
+) -> list[Reaction]:
+    """Add transport reactions from one compartment to one or more others.
+    Parameters
+    ----------
+    from_compartment
+        Source compartment id.
+    to_compartments
+        Target compartment id(s).
+    metabolite_names
+        Names of metabolites to transport. Default: every metabolite in
+        ``from_compartment``.
+    reversible
+        If True (default), bounds span the cobra configuration default
+        (reversible); otherwise lower bound 0.
+    only_to_existing
+        If True (default), only transport a metabolite into a target
+        compartment where a same-named metabolite already exists. If False,
+        create the missing target metabolite (copying name/formula/charge/
+        annotation from the source) before adding the transport.
+    id_prefix
+        Prefix for the sequential reaction IDs (``tr_0001``, ...).
+
+    Returns
+    -------
+    list of cobra.Reaction
+        The transport reactions added, in creation order.
+    """
+    # cobra's `model.compartments` only lists compartments that have metabolites;
+    # include registered-but-empty ones so transport can target an empty compartment.
+    known = set(model.compartments) | set(model._compartments)
+    if from_compartment not in known:
+        raise ValueError(f"Compartment {from_compartment!r} is not in the model.")
+    if isinstance(to_compartments, str):
+        to_compartments = [to_compartments]
+    else:
+        to_compartments = list(to_compartments)
+    for comp in to_compartments:
+        if comp not in known:
+            raise ValueError(f"Compartment {comp!r} is not in the model.")
+
+    source = _index_by_name(
+        (m for m in model.metabolites if m.compartment == from_compartment),
+        from_compartment,
+    )
+    if metabolite_names is None:
+        names = list(source)
+    else:
+        names = [metabolite_names] if isinstance(metabolite_names, str) else list(metabolite_names)
+        missing = [n for n in names if n not in source]
+        if missing:
+            raise ValueError(
+                f"Metabolites not found in compartment {from_compartment!r}: {missing}"
+            )
+
+    cfg = cobra.Configuration()
+    bounds = (cfg.lower_bound, cfg.upper_bound) if reversible else (0.0, cfg.upper_bound)
+    from_name = model.compartments.get(from_compartment) or from_compartment
+    next_id = _transport_id_factory(model, id_prefix)
+
+    added: list[Reaction] = []
+    for to_comp in to_compartments:
+        to_name = model.compartments.get(to_comp) or to_comp
+        targets = _index_by_name(
+            (m for m in model.metabolites if m.compartment == to_comp),
+            to_comp,
+        )
+        for name in names:
+            src = source[name]
+            dst = targets.get(name)
+            if dst is None:
+                if only_to_existing:
+                    continue
+                dst = Metabolite(
+                    _new_met_id(model, "m"),
+                    name=name,
+                    compartment=to_comp,
+                    formula=src.formula,
+                    charge=src.charge,
+                )
+                dst.annotation = dict(src.annotation)
+                model.add_metabolites([dst])
+                targets[name] = dst
+
+            rxn = Reaction(next_id())
+            rxn.name = f"{name} transport, {from_name}-{to_name}"
+            rxn.bounds = bounds
+            model.add_reactions([rxn])
+            rxn.add_metabolites({src: -1, dst: 1})
+            added.append(rxn)
+
+    return added
diff --git a/tests/test_change_grrules.py b/tests/test_change_grrules.py
new file mode 100644
index 0000000..d33f723
--- /dev/null
+++ b/tests/test_change_grrules.py
@@ -0,0 +1,49 @@
+"""Tests for change_gene_reaction_rules (changeGrRules port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, change_gene_reaction_rules
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"},
+            {"id": "R2", "equation": "a_c --> b_c"},
+        ],
+    )
+    return m
+
+
+def test_replace_rule_and_create_genes(model):
+    (rxn,) = change_gene_reaction_rules(model, {"R1": "G2 and G3"})
+    assert rxn.gene_reaction_rule == "G2 and G3"
+    assert {g.id for g in rxn.genes} == {"G2", "G3"}
+    assert {"G2", "G3"} <= {g.id for g in model.genes}
+
+
+def test_append_rule(model):
+    change_gene_reaction_rules(model, {"R1": "G4"}, replace=False)
+    # (G1) or (G4), normalised by cobra
+    assert model.reactions.get_by_id("R1").gene_reaction_rule == "G1 or G4"
+
+
+def test_append_when_empty_is_just_new(model):
+    change_gene_reaction_rules(model, {"R2": "G5"}, replace=False)
+    assert model.reactions.get_by_id("R2").gene_reaction_rule == "G5"
+
+
+def test_batch(model):
+    changed = change_gene_reaction_rules(model, {"R1": "GA", "R2": "GB"})
+    assert [r.id for r in changed] == ["R1", "R2"]
+
+
+def test_unknown_reaction_errors(model):
+    with pytest.raises(ValueError, match="not found"):
+        change_gene_reaction_rules(model, {"NOPE": "G1"})
diff --git a/tests/test_manipulation_add.py b/tests/test_manipulation_add.py
new file mode 100644
index 0000000..2a3a9d3
--- /dev/null
+++ b/tests/test_manipulation_add.py
@@ -0,0 +1,278 @@
+"""Tests for raven_python.manipulation.add (addRxns port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils.parse import parse_name_comp
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+            cobra.Metabolite("h2o_c", name="H2O", compartment="c"),
+            cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+            cobra.Metabolite("pi_c", name="phosphate", compartment="c"),
+        ]
+    )
+    return m
+
+
+# --- parse_name_comp -------------------------------------------------------
+
+@pytest.mark.parametrize(
+    "token,expected",
+    [
+        ("ATP[c]", ("ATP", "c")),
+        ("ATP", ("ATP", None)),
+        ("  ATP[c] ", ("ATP", "c")),
+        ("weird[name][m]", ("weird[name]", "m")),
+    ],
+)
+def test_parse_name_comp(token, expected):
+    assert parse_name_comp(token) == expected
+
+
+# --- id mode (eqnType 1) ---------------------------------------------------
+
+def test_add_by_id_basic_and_reversibility(model):
+    (rxn,) = add_reactions_from_equations(
+        model, [{"id": "R1", "equation": "atp_c + h2o_c <=> adp_c + pi_c"}]
+    )
+    assert rxn.id == "R1"
+    assert rxn.reversibility is True
+    assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+        "atp_c": -1.0,
+        "h2o_c": -1.0,
+        "adp_c": 1.0,
+        "pi_c": 1.0,
+    }
+
+
+def test_irreversible_arrows(model):
+    rxns = add_reactions_from_equations(
+        model,
+        [
+            {"id": "R1", "equation": "atp_c --> adp_c"},
+            {"id": "R2", "equation": "atp_c => adp_c"},
+        ],
+    )
+    for r in rxns:
+        assert r.lower_bound == 0.0
+        assert r.reversibility is False
+
+
+def test_coefficients(model):
+    (rxn,) = add_reactions_from_equations(
+        model, [{"id": "R1", "equation": "2 atp_c + 1.5 h2o_c --> adp_c"}]
+    )
+    assert rxn.get_coefficient("atp_c") == -2.0
+    assert rxn.get_coefficient("h2o_c") == -1.5
+
+
+def test_id_mode_creates_new_met_in_compartment(model):
+    add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "atp_c --> amp_c"}],
+        compartment="c",
+    )
+    assert "amp_c" in model.metabolites
+    assert model.metabolites.get_by_id("amp_c").compartment == "c"
+
+
+def test_id_mode_new_met_without_compartment_errors(model):
+    with pytest.raises(ValueError, match="no compartment"):
+        add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> amp_c"}])
+
+
+# --- name mode (eqnType 2) -------------------------------------------------
+
+def test_name_mode_matches_existing_by_name(model):
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "ATP + H2O <=> ADP + phosphate"}],
+        mets_by="name",
+        compartment="c",
+    )
+    # resolved to the existing _c metabolites, not new ones
+    assert {m.id for m in rxn.metabolites} == {"atp_c", "h2o_c", "adp_c", "pi_c"}
+    assert len(model.metabolites) == 4
+
+
+def test_name_mode_creates_new_met_with_auto_id(model):
+    add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "ATP --> AMP"}],
+        mets_by="name",
+        compartment="c",
+    )
+    new = [m for m in model.metabolites if m.name == "AMP"]
+    assert len(new) == 1
+    assert new[0].id == "m1"
+    assert new[0].compartment == "c"
+
+
+def test_name_mode_requires_compartment(model):
+    with pytest.raises(ValueError, match="needs a compartment"):
+        add_reactions_from_equations(
+            model, [{"id": "R1", "equation": "ATP --> ADP"}], mets_by="name"
+        )
+
+
+# --- name[comp] mode (eqnType 3) -------------------------------------------
+
+def test_name_comp_syntax(model):
+    model.add_metabolites([cobra.Metabolite("atp_m", name="ATP", compartment="m")])
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "ATP[c] --> ATP[m]"}],
+        mets_by="name",
+        compartment="c",
+    )
+    # matched ATP in two different compartments by name[comp]
+    assert {m.id for m in rxn.metabolites} == {"atp_c", "atp_m"}
+
+
+# --- genes -----------------------------------------------------------------
+
+def test_gene_rule_auto_creates_genes(model):
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}],
+    )
+    assert {g.id for g in rxn.genes} == {"G1", "G2"}
+    assert {g.id for g in model.genes} == {"G1", "G2"}
+
+
+def test_strict_genes_errors_on_unknown(model):
+    with pytest.raises(ValueError, match="genes not in the model"):
+        add_reactions_from_equations(
+            model,
+            [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}],
+            allow_new_genes=False,
+        )
+
+
+def test_strict_genes_ok_when_present(model):
+    model.genes.append(cobra.core.gene.Gene("G1"))
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}],
+        allow_new_genes=False,
+    )
+    assert rxn.gene_reaction_rule == "G1"
+
+
+# --- guards & extras -------------------------------------------------------
+
+def test_duplicate_reaction_id_errors(model):
+    model.add_reactions([cobra.Reaction("R1")])
+    with pytest.raises(ValueError, match="already exists"):
+        add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> adp_c"}])
+
+
+def test_strict_mets_errors(model):
+    with pytest.raises(ValueError, match="allow_new_mets"):
+        add_reactions_from_equations(
+            model,
+            [{"id": "R1", "equation": "atp_c --> amp_c"}],
+            compartment="c",
+            allow_new_mets=False,
+        )
+
+
+def test_explicit_bounds_override_arrow(model):
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "atp_c <=> adp_c", "bounds": (0, 50), "name": "myrxn"}],
+    )
+    assert rxn.bounds == (0, 50)
+    assert rxn.name == "myrxn"
+
+
+def test_net_zero_metabolite_dropped(model):
+    # atp_c on both sides nets to zero and is removed.
+    (rxn,) = add_reactions_from_equations(
+        model, [{"id": "R1", "equation": "atp_c + h2o_c --> atp_c + adp_c"}]
+    )
+    assert "atp_c" not in {m.id for m in rxn.metabolites}
+    assert {m.id for m in rxn.metabolites} == {"h2o_c", "adp_c"}
+
+
+def test_missing_equation_errors(model):
+    with pytest.raises(ValueError, match="missing required 'equation'"):
+        add_reactions_from_equations(model, [{"id": "R1"}])
+
+
+def test_no_arrow_errors(model):
+    with pytest.raises(ValueError, match="No reaction arrow"):
+        add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c + h2o_c"}])
+
+
+# --- regression: leading-number metabolite name (known_issues.md A1) -------
+
+def test_name_mode_preserves_leading_number_name(model):
+    """A metabolite name that begins with a number isn't misparsed as a coefficient.
+
+    Before the fix the token ``"2 oxoglutarate"`` was parsed as ``(coeff=2, name="oxoglutarate")``
+    silently — corrupting the stoichiometry. The resolver now prefers the full
+    token when it matches an existing metabolite name.
+    """
+    model.add_metabolites([
+        cobra.Metabolite("akg_c", name="2 oxoglutarate", compartment="c"),
+    ])
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "ATP + 2 oxoglutarate --> ADP"}],
+        mets_by="name",
+        compartment="c",
+    )
+    assert rxn.get_coefficient("akg_c") == -1.0  # not -2.0
+    assert rxn.get_coefficient("atp_c") == -1.0
+
+
+def test_name_mode_coefficient_still_works_without_collision(model):
+    """If the full token doesn't match anything, fall back to coefficient split."""
+    (rxn,) = add_reactions_from_equations(
+        model,
+        [{"id": "R1", "equation": "2 ATP + H2O --> ADP + phosphate"}],
+        mets_by="name",
+        compartment="c",
+    )
+    assert rxn.get_coefficient("atp_c") == -2.0
+
+
+# --- regression: empty-stoichiometry warning (known_issues.md A2) ----------
+
+def test_empty_stoichiometry_warns(model):
+    """All-terms-cancel reaction warns instead of silently shipping an empty rxn."""
+    with pytest.warns(UserWarning, match="no net metabolites"):
+        (rxn,) = add_reactions_from_equations(
+            model, [{"id": "R1", "equation": "atp_c --> atp_c"}]
+        )
+    assert len(rxn.metabolites) == 0
+
+
+# --- regression: unknown-compartment warning (known_issues.md B2) ----------
+
+def test_id_mode_unknown_compartment_warns(model):
+    """A typo'd compartment used to silently produce a one-met ghost compartment
+    in id mode (the name/[comp] path used to validate, id mode never did)."""
+    with pytest.warns(UserWarning, match="unregistered compartment 'cyto'"):
+        add_reactions_from_equations(
+            model,
+            [{"id": "R1", "equation": "atp_c --> amp_c"}],
+            compartment="cyto",  # typo for 'c'
+        )
+
+
+def test_name_comp_unknown_compartment_warns(model):
+    """Same defensive check in the name[comp] path when allow_new_mets=True."""
+    with pytest.warns(UserWarning, match="unregistered compartment 'mito'"):
+        add_reactions_from_equations(
+            model,
+            [{"id": "R1", "equation": "ATP[c] --> AMP[mito]"}],
+            mets_by="name",
+        )
diff --git a/tests/test_manipulation_change.py b/tests/test_manipulation_change.py
new file mode 100644
index 0000000..8d54f58
--- /dev/null
+++ b/tests/test_manipulation_change.py
@@ -0,0 +1,93 @@
+"""Tests for raven_python.manipulation.change (changeRxns port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, change_reaction_equations
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("a_c", name="A", compartment="c"),
+            cobra.Metabolite("b_c", name="B", compartment="c"),
+            cobra.Metabolite("c_c", name="C", compartment="c"),
+        ]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {
+                "id": "R1",
+                "equation": "a_c <=> b_c",
+                "name": "first",
+                "bounds": (-30, 70),
+                "gene_reaction_rule": "G1 or G2",
+                "subsystem": "sub",
+            },
+            {"id": "R2", "equation": "a_c --> c_c"},
+        ],
+    )
+    return m
+
+
+def test_changes_stoichiometry(model):
+    (rxn,) = change_reaction_equations(model, {"R1": "a_c --> 2 c_c"})
+    assert rxn.id == "R1"
+    assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+        "a_c": -1.0,
+        "c_c": 2.0,
+    }
+
+
+def test_preserves_other_fields(model):
+    before = model.reactions.get_by_id("R1")
+    name, bounds, subsystem = before.name, before.bounds, before.subsystem
+    genes = {g.id for g in before.genes}
+
+    change_reaction_equations(model, {"R1": "a_c --> c_c"})
+
+    after = model.reactions.get_by_id("R1")
+    assert after.name == name
+    assert after.bounds == bounds  # bounds untouched, per RAVEN
+    assert after.subsystem == subsystem
+    assert {g.id for g in after.genes} == genes
+
+
+def test_preserves_reaction_order(model):
+    order_before = [r.id for r in model.reactions]
+    change_reaction_equations(model, {"R1": "b_c --> c_c"})
+    assert [r.id for r in model.reactions] == order_before
+
+
+def test_bounds_not_changed_by_arrow(model):
+    # R1 starts reversible (-30, 70); a --> arrow must NOT make it irreversible.
+    change_reaction_equations(model, {"R1": "a_c --> b_c"})
+    assert model.reactions.get_by_id("R1").bounds == (-30, 70)
+
+
+def test_name_mode(model):
+    (rxn,) = change_reaction_equations(
+        model, {"R2": "A --> C"}, mets_by="name", compartment="c"
+    )
+    assert {m.id for m in rxn.metabolites} == {"a_c", "c_c"}
+
+
+def test_can_introduce_new_met(model):
+    change_reaction_equations(
+        model, {"R2": "a_c --> d_c"}, compartment="c"
+    )
+    assert "d_c" in model.metabolites
+    assert model.reactions.get_by_id("R2").get_coefficient("d_c") == 1.0
+
+
+def test_unknown_reaction_errors(model):
+    with pytest.raises(ValueError, match="not found"):
+        change_reaction_equations(model, {"NOPE": "a_c --> b_c"})
+
+
+def test_multiple_reactions(model):
+    changed = change_reaction_equations(model, {"R1": "a_c --> c_c", "R2": "b_c --> c_c"})
+    assert [r.id for r in changed] == ["R1", "R2"]
+    assert model.reactions.get_by_id("R2").get_coefficient("b_c") == -1.0
diff --git a/tests/test_manipulation_compartments.py b/tests/test_manipulation_compartments.py
new file mode 100644
index 0000000..4d3fb3b
--- /dev/null
+++ b/tests/test_manipulation_compartments.py
@@ -0,0 +1,139 @@
+"""Tests for manipulation/compartments.py — merge_compartments + copy_to_compartment."""
+from __future__ import annotations
+
+import cobra
+import pytest
+
+from raven_python.manipulation.compartments import copy_to_compartment, merge_compartments
+
+
+def _two_compartment_model() -> cobra.Model:
+    """A_c → B_c, A_m → B_m, and a transport A_c ↔ A_m. Multi-compartment toy."""
+    m = cobra.Model("toy")
+    A_c = cobra.Metabolite("A_c", name="A", compartment="c")
+    A_m = cobra.Metabolite("A_m", name="A", compartment="m")
+    B_c = cobra.Metabolite("B_c", name="B", compartment="c")
+    B_m = cobra.Metabolite("B_m", name="B", compartment="m")
+    m.add_metabolites([A_c, A_m, B_c, B_m])
+
+    def rxn(rid, lb, ub, mets, gpr=None):
+        r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub)
+        r.add_metabolites(mets)
+        if gpr:
+            r.gene_reaction_rule = gpr
+        return r
+    m.add_reactions([rxn("r_c", 0, 1000, {A_c: -1, B_c: 1}, "g1"),
+                     rxn("r_m", 0, 1000, {A_m: -1, B_m: 1}, "g2"),
+                     rxn("tr_A", -1000, 1000, {A_c: -1, A_m: 1})])
+    return m
+
+
+# ----------------------------------------------------------------- merge_compartments
+
+def test_merge_compartments_collapses_to_one():
+    """A_c + A_m → A; B_c + B_m → B; transport A_c↔A_m self-cancels and is dropped."""
+    m = _two_compartment_model()
+    merged, deleted, dupes = merge_compartments(m)
+    # Only the base ids survive.
+    assert {x.id for x in merged.metabolites} == {"A", "B"}
+    # The transport reaction collapsed (A → A) and was deleted.
+    assert "tr_A" in deleted
+    # r_c and r_m are now both A → B; one of them gets deduplicated.
+    surviving = {r.id for r in merged.reactions}
+    assert len(surviving & {"r_c", "r_m"}) == 1
+    assert (set(dupes) | (surviving & {"r_c", "r_m"})) == {"r_c", "r_m"}
+
+
+def test_merge_compartments_preserves_gpr_and_subsystem():
+    m = _two_compartment_model()
+    m.reactions.r_c.subsystem = "carbo"
+    merged, _, _ = merge_compartments(m)
+    survivor = next(r for r in merged.reactions if r.id in {"r_c", "r_m"})
+    # The survivor keeps its gene rule + subsystem (cobra may sometimes lose them
+    # through copy; we set them explicitly).
+    assert survivor.gene_reaction_rule in {"g1", "g2"}
+    if survivor.id == "r_c":
+        assert survivor.subsystem == "carbo"
+
+
+def test_merge_compartments_keeps_single_met_reactions_when_asked():
+    """drop_single_metabolite_reactions=False keeps the collapsed transport (now A → A,
+    which is empty stoichiometry after net-cancellation — still dropped, but the *one-met*
+    case is the more interesting one). Use a uniport pattern to exercise it."""
+    m = cobra.Model("uniport")
+    A_c = cobra.Metabolite("A_c", name="A", compartment="c")
+    A_m = cobra.Metabolite("A_m", name="A", compartment="m")
+    H_c = cobra.Metabolite("H_c", name="H", compartment="c")
+    m.add_metabolites([A_c, A_m, H_c])
+    # H+ symport: A_c + H_c → A_m. After merge: A + H → A → leaves H.
+    sym = cobra.Reaction("sym", lower_bound=0, upper_bound=1000)
+    sym.add_metabolites({A_c: -1, H_c: -1, A_m: 1})
+    m.add_reactions([sym])
+    merged_drop, deleted_drop, _ = merge_compartments(m, drop_single_metabolite_reactions=True)
+    assert "sym" in deleted_drop
+    merged_keep, deleted_keep, _ = merge_compartments(m, drop_single_metabolite_reactions=False)
+    # With keep, sym survives as a one-met reaction (consumes H).
+    assert "sym" not in deleted_keep
+    assert "sym" in {r.id for r in merged_keep.reactions}
+
+
+def test_merge_compartments_deduplicate_off_keeps_both():
+    m = _two_compartment_model()
+    merged, _, dupes = merge_compartments(m, deduplicate_reactions=False)
+    assert dupes == []
+    assert {"r_c", "r_m"} <= {r.id for r in merged.reactions}
+
+
+# ----------------------------------------------------------------- copy_to_compartment
+
+def test_copy_to_compartment_basic():
+    """Copy r_c into 'p' (peroxisome): a new reaction r_c_p with metabolites in p."""
+    m = _two_compartment_model()
+    out, new_rxns, new_mets = copy_to_compartment(m, ["r_c"], "p",
+                                                    target_compartment_name="peroxisome")
+    assert "r_c_p" in [r.id for r in out.reactions]
+    new_r = out.reactions.r_c_p
+    assert {x.compartment for x in new_r.metabolites} == {"p"}
+    assert "A_p" in [x.id for x in out.metabolites]
+    assert "B_p" in [x.id for x in out.metabolites]
+    assert new_rxns == ["r_c_p"]
+    assert set(new_mets) == {"A_p", "B_p"}
+    # Original still there.
+    assert "r_c" in [r.id for r in out.reactions]
+
+
+def test_copy_to_compartment_preserves_gpr_and_bounds():
+    m = _two_compartment_model()
+    out, _, _ = copy_to_compartment(m, ["r_c"], "p")
+    new_r = out.reactions.r_c_p
+    assert new_r.gene_reaction_rule == "g1"
+    assert new_r.lower_bound == 0 and new_r.upper_bound == 1000
+
+
+def test_copy_to_compartment_delete_original_is_a_move():
+    m = _two_compartment_model()
+    out, _, _ = copy_to_compartment(m, ["r_c"], "p", delete_original=True)
+    assert "r_c" not in [r.id for r in out.reactions]
+    assert "r_c_p" in [r.id for r in out.reactions]
+
+
+def test_copy_to_compartment_idempotent():
+    """Calling twice doesn't add the reaction twice."""
+    m = _two_compartment_model()
+    out, _, _ = copy_to_compartment(m, ["r_c"], "p")
+    out2, new_rxns, _ = copy_to_compartment(out, ["r_c"], "p")
+    assert new_rxns == []  # nothing added on second call
+    assert len([r for r in out2.reactions if r.id == "r_c_p"]) == 1
+
+
+def test_copy_to_compartment_unknown_reaction_raises():
+    m = _two_compartment_model()
+    with pytest.raises(ValueError, match="not in model"):
+        copy_to_compartment(m, ["does_not_exist"], "p")
+
+
+def test_copy_to_compartment_custom_suffix():
+    m = _two_compartment_model()
+    out, new_rxns, _ = copy_to_compartment(m, ["r_c"], "p", id_suffix="copy1")
+    assert new_rxns == ["r_c_copy1"]
+    assert "A_copy1" in [x.id for x in out.metabolites]
diff --git a/tests/test_manipulation_expand.py b/tests/test_manipulation_expand.py
new file mode 100644
index 0000000..08cd2f2
--- /dev/null
+++ b/tests/test_manipulation_expand.py
@@ -0,0 +1,288 @@
+"""Tests for expand_model (RAVEN expandModel.m) — splitting isozymes into reactions.
+
+Adopted from geckopy's tests/test_expand.py.
+"""
+import cobra
+
+from raven_python.manipulation import expand_model
+from raven_python.manipulation.expand import _gpr_to_dnf
+
+# --------------------------------------------------------------------------- #
+# DNF conversion (internal helper, worth testing directly)
+# --------------------------------------------------------------------------- #
+
+def _dnf_from_gpr_string(gpr_str: str) -> list[list[str]]:
+    from cobra.core.gene import GPR
+
+    gpr = GPR.from_string(gpr_str)
+    return _gpr_to_dnf(gpr)
+
+
+def test_dnf_empty_gpr():
+    assert _dnf_from_gpr_string("") == []
+
+
+def test_dnf_single_gene():
+    assert _dnf_from_gpr_string("g1") == [["g1"]]
+
+
+def test_dnf_simple_and():
+    assert _dnf_from_gpr_string("g1 and g2") == [["g1", "g2"]]
+
+
+def test_dnf_simple_or():
+    assert _dnf_from_gpr_string("g1 or g2") == [["g1"], ["g2"]]
+
+
+def test_dnf_or_of_ands():
+    assert _dnf_from_gpr_string("(g1 and g2) or (g3 and g4)") == [
+        ["g1", "g2"],
+        ["g3", "g4"],
+    ]
+
+
+def test_dnf_distributes_and_over_or():
+    result = _dnf_from_gpr_string("g1 and (g2 or g3)")
+    assert result == [["g1", "g2"], ["g1", "g3"]]
+
+
+def test_dnf_triple_or():
+    assert _dnf_from_gpr_string("g1 or g2 or g3") == [
+        ["g1"], ["g2"], ["g3"],
+    ]
+
+
+def test_dnf_preserves_gene_order_within_clause():
+    result = _dnf_from_gpr_string("g3 and g1 and g2")
+    assert result == [["g3", "g1", "g2"]]
+
+
+# --------------------------------------------------------------------------- #
+# expand_model
+# --------------------------------------------------------------------------- #
+
+def _build_model(
+    reactions: list[tuple[str, dict[str, float], float, float, str]],
+) -> cobra.Model:
+    """Build from (rxn_id, {met_id: coef}, lb, ub, gpr) tuples."""
+    model = cobra.Model("test")
+    mets: dict[str, cobra.Metabolite] = {}
+    for _, stoich, _, _, _ in reactions:
+        for met_id in stoich:
+            if met_id not in mets:
+                mets[met_id] = cobra.Metabolite(met_id, compartment="c")
+
+    for rxn_id, stoich, lb, ub, gpr in reactions:
+        rxn = cobra.Reaction(rxn_id)
+        rxn.lower_bound = lb
+        rxn.upper_bound = ub
+        rxn.add_metabolites({mets[m]: c for m, c in stoich.items()})
+        if gpr:
+            rxn.gene_reaction_rule = gpr
+        model.add_reactions([rxn])
+    return model
+
+
+def test_does_not_expand_reaction_without_gpr():
+    model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "")])
+    added = expand_model(model)
+    assert added == []
+    assert "r1" in {r.id for r in model.reactions}
+
+
+def test_does_not_expand_single_and_clause():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 and g2"),
+    ])
+    added = expand_model(model)
+    assert added == []
+    r1 = model.reactions.get_by_id("r1")
+    assert r1.gene_reaction_rule == "g1 and g2"
+
+
+def test_does_not_expand_single_gene():
+    model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1")])
+    added = expand_model(model)
+    assert added == []
+    assert model.reactions.get_by_id("r1").gene_reaction_rule == "g1"
+
+
+def test_splits_simple_or_into_two_reactions():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+    ])
+    added = expand_model(model)
+
+    assert added == ["r1_EXP_1", "r1_EXP_2"]
+    rxn_ids = {r.id for r in model.reactions}
+    assert "r1" not in rxn_ids
+    assert "r1_EXP_1" in rxn_ids
+    assert "r1_EXP_2" in rxn_ids
+
+    assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1"
+    assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g2"
+
+
+def test_splits_or_of_ands():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+         "(g1 and g2) or (g3 and g4)"),
+    ])
+    added = expand_model(model)
+
+    assert added == ["r1_EXP_1", "r1_EXP_2"]
+    assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2"
+    assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g3 and g4"
+
+
+def test_distributes_and_over_or():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+         "g1 and (g2 or g3)"),
+    ])
+    added = expand_model(model)
+
+    assert added == ["r1_EXP_1", "r1_EXP_2"]
+    assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2"
+    assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g1 and g3"
+
+
+def test_expanded_reactions_inherit_stoichiometry_and_bounds():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 2.0}, -500.0, 1500.0, "g1 or g2"),
+    ])
+    expand_model(model)
+
+    for suffix in ("_EXP_1", "_EXP_2"):
+        rxn = model.reactions.get_by_id(f"r1{suffix}")
+        assert rxn.bounds == (-500.0, 1500.0)
+        stoich = {m.id: c for m, c in rxn.metabolites.items()}
+        assert stoich == {"A": -1.0, "B": 2.0}
+
+
+def test_expanded_reactions_inherit_name_and_subsystem():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+    ])
+    r1 = model.reactions.get_by_id("r1")
+    r1.name = "an isozyme-catalyzed reaction"
+    r1.subsystem = "central metabolism"
+
+    expand_model(model)
+
+    for suffix in ("_EXP_1", "_EXP_2"):
+        rxn = model.reactions.get_by_id(f"r1{suffix}")
+        assert rxn.name == "an isozyme-catalyzed reaction"
+        assert rxn.subsystem == "central metabolism"
+
+
+def test_multiple_reactions_expand_independently():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+        ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"),
+        ("r3", {"C": -1.0, "D": 1.0}, 0.0, 1000.0,
+         "(g5 and g6) or g7 or (g8 and g9)"),
+    ])
+    added = expand_model(model)
+
+    assert added == sorted([
+        "r1_EXP_1", "r1_EXP_2",
+        "r3_EXP_1", "r3_EXP_2", "r3_EXP_3",
+    ])
+
+    rxn_ids = {r.id for r in model.reactions}
+    assert "r2" in rxn_ids
+    assert "r1" not in rxn_ids
+    assert "r3" not in rxn_ids
+
+    assert model.reactions.get_by_id("r2").gene_reaction_rule == "g3 and g4"
+    assert model.reactions.get_by_id("r3_EXP_1").gene_reaction_rule == "g5 and g6"
+    assert model.reactions.get_by_id("r3_EXP_2").gene_reaction_rule == "g7"
+    assert model.reactions.get_by_id("r3_EXP_3").gene_reaction_rule == "g8 and g9"
+
+
+def test_expanded_reaction_has_correct_gene_set():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+         "(g1 and g2) or (g3 and g4)"),
+    ])
+    expand_model(model)
+
+    r1_1 = model.reactions.get_by_id("r1_EXP_1")
+    assert {g.id for g in r1_1.genes} == {"g1", "g2"}
+
+    r1_2 = model.reactions.get_by_id("r1_EXP_2")
+    assert {g.id for g in r1_2.genes} == {"g3", "g4"}
+
+
+def test_expansion_is_idempotent_in_the_no_op_sense():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+        ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"),
+    ])
+    expand_model(model)
+    ids_before = {r.id for r in model.reactions}
+
+    second = expand_model(model)
+    assert second == []
+
+    ids_after = {r.id for r in model.reactions}
+    assert ids_after == ids_before
+
+
+def test_empty_model_is_unchanged():
+    model = cobra.Model("empty")
+    assert expand_model(model) == []
+
+
+# --------------------------------------------------------------------------- #
+# Annotation and notes propagation
+# --------------------------------------------------------------------------- #
+
+def test_expanded_reactions_inherit_annotation_and_notes():
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+    ])
+    r1 = model.reactions.get_by_id("r1")
+    r1.annotation["ec-code"] = "1.2.3.4"
+    r1.annotation["sbo"] = "SBO:0000176"
+    r1.notes["custom"] = "hello"
+
+    expand_model(model)
+
+    for suffix in ("_EXP_1", "_EXP_2"):
+        rxn = model.reactions.get_by_id(f"r1{suffix}")
+        assert rxn.annotation["ec-code"] == "1.2.3.4"
+        assert rxn.annotation["sbo"] == "SBO:0000176"
+        assert rxn.notes["custom"] == "hello"
+
+
+def test_expanded_reaction_annotation_is_independent_of_parent():
+    """Mutating one expanded reaction's annotation must not affect siblings."""
+    model = _build_model([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+    ])
+    model.reactions.get_by_id("r1").annotation["ec-code"] = ["1.2.3.4"]
+
+    expand_model(model)
+
+    r1_1 = model.reactions.get_by_id("r1_EXP_1")
+    r1_2 = model.reactions.get_by_id("r1_EXP_2")
+    r1_1.annotation["ec-code"].append("9.9.9.9")
+    assert r1_2.annotation["ec-code"] == ["1.2.3.4"]
+
+
+def test_objective_coefficient_preserved_on_expansion():
+    """An expanded reaction's isozyme copies retain the original objective coefficient."""
+    m = cobra.Model("o")
+    a, b = (cobra.Metabolite(x, compartment="c") for x in "ab")
+    m.add_metabolites([a, b])
+    r = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+    r.add_metabolites({a: -1, b: 1})
+    r.gene_reaction_rule = "g1 or g2"
+    m.add_reactions([r])
+    m.objective = "r1"  # objective on the soon-to-be-expanded reaction
+
+    expand_model(m)
+    coeffs = {rx.id: rx.objective_coefficient for rx in m.reactions}
+    assert coeffs == {"r1_EXP_1": 1.0, "r1_EXP_2": 1.0}  # objective survives on both copies
diff --git a/tests/test_manipulation_irreversible.py b/tests/test_manipulation_irreversible.py
new file mode 100644
index 0000000..e211fa3
--- /dev/null
+++ b/tests/test_manipulation_irreversible.py
@@ -0,0 +1,144 @@
+"""Tests for convert_to_irreversible (RAVEN convertToIrrev.m).
+
+Adopted from geckopy's tests/test_preprocess.py (the convert_to_irreversible subset).
+Exchange reactions are excluded from the split, matching MATLAB behavior.
+"""
+import cobra
+
+from raven_python.manipulation import convert_to_irreversible
+
+
+def _build_model_with_bounds(
+    reactions: list[tuple[str, dict[str, float], float, float]],
+) -> cobra.Model:
+    """Build from (rxn_id, {met_id: coef}, lb, ub) tuples."""
+    model = cobra.Model("test")
+    mets: dict[str, cobra.Metabolite] = {}
+    for _, stoich, _, _ in reactions:
+        for met_id in stoich:
+            if met_id not in mets:
+                mets[met_id] = cobra.Metabolite(met_id, compartment="c")
+
+    for rxn_id, stoich, lb, ub in reactions:
+        rxn = cobra.Reaction(rxn_id)
+        rxn.lower_bound = lb
+        rxn.upper_bound = ub
+        rxn.add_metabolites({mets[m]: c for m, c in stoich.items()})
+        model.add_reactions([rxn])
+    return model
+
+
+def test_splits_single_reversible_non_exchange():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+    ])
+
+    added = convert_to_irreversible(model)
+    assert added == ["r1_REV"]
+
+    fwd = model.reactions.get_by_id("r1")
+    rev = model.reactions.get_by_id("r1_REV")
+
+    assert fwd.bounds == (0.0, 1000.0)
+    assert {m.id: c for m, c in fwd.metabolites.items()} == {"A": -1.0, "B": 1.0}
+
+    assert rev.bounds == (0.0, 500.0)
+    assert {m.id: c for m, c in rev.metabolites.items()} == {"A": 1.0, "B": -1.0}
+
+
+def test_does_not_split_forward_only_reaction():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0),
+    ])
+    added = convert_to_irreversible(model)
+    assert added == []
+    assert "r1_REV" not in {r.id for r in model.reactions}
+
+
+def test_does_not_split_exchange_reaction_even_if_reversible():
+    """Exchange reactions (one metabolite) are explicitly excluded from
+    the irreversibility step in MATLAB, regardless of bounds."""
+    model = _build_model_with_bounds([
+        ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+    ])
+    added = convert_to_irreversible(model)
+    assert added == []
+    ex = model.reactions.get_by_id("EX_A")
+    assert ex.bounds == (-1000.0, 1000.0)
+
+
+def test_splits_multiple_mixed_reactions():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),   # split
+        ("r2", {"B": -2.0, "C": 3.0}, 0.0, 1000.0),      # forward only
+        ("EX_A", {"A": -1.0}, -1000.0, 1000.0),          # exchange
+        ("r3", {"C": -1.0, "D": 1.0}, -200.0, 200.0),    # split
+    ])
+
+    added = convert_to_irreversible(model)
+    assert added == ["r1_REV", "r3_REV"]
+
+    assert model.reactions.get_by_id("r1").bounds == (0.0, 1000.0)
+    assert model.reactions.get_by_id("r1_REV").bounds == (0.0, 500.0)
+    assert model.reactions.get_by_id("r2").bounds == (0.0, 1000.0)
+    assert model.reactions.get_by_id("EX_A").bounds == (-1000.0, 1000.0)
+    assert model.reactions.get_by_id("r3").bounds == (0.0, 200.0)
+    assert model.reactions.get_by_id("r3_REV").bounds == (0.0, 200.0)
+
+
+def test_reverse_reaction_inherits_gpr():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+    ])
+    model.reactions.get_by_id("r1").gene_reaction_rule = "g1 and g2"
+
+    convert_to_irreversible(model)
+
+    rev = model.reactions.get_by_id("r1_REV")
+    assert rev.gene_reaction_rule == "g1 and g2"
+    assert {g.id for g in rev.genes} == {"g1", "g2"}
+
+
+def test_forward_reaction_lb_is_clamped_to_zero():
+    """After splitting, the original reaction should have lb = 0,
+    which is what MATLAB's convertToIrrev does."""
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+    ])
+    convert_to_irreversible(model)
+    assert model.reactions.get_by_id("r1").lower_bound == 0.0
+
+
+def test_no_reverse_reaction_has_negative_bound():
+    """After conversion, no non-exchange reaction may carry negative flux."""
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+        ("r2", {"B": -1.0, "C": 1.0}, -1000.0, 0.0),      # blocked reverse
+        ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+    ])
+    convert_to_irreversible(model)
+    for rxn in model.reactions:
+        if rxn.boundary:
+            continue
+        assert rxn.lower_bound >= 0, f"{rxn.id} still has lb < 0"
+
+
+def test_returns_empty_list_when_nothing_to_split():
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0),
+        ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+    ])
+    assert convert_to_irreversible(model) == []
+
+
+def test_conversion_is_idempotent_after_first_pass():
+    """Running convert_to_irreversible twice should not create
+    `_REV_REV` reactions, because the first pass already clamped
+    all non-exchange lb to 0."""
+    model = _build_model_with_bounds([
+        ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+    ])
+    convert_to_irreversible(model)
+    second = convert_to_irreversible(model)
+    assert second == []
+    assert "r1_REV_REV" not in {r.id for r in model.reactions}
diff --git a/tests/test_manipulation_merge.py b/tests/test_manipulation_merge.py
new file mode 100644
index 0000000..a430f6e
--- /dev/null
+++ b/tests/test_manipulation_merge.py
@@ -0,0 +1,136 @@
+"""Tests for merge_models (mergeModels port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, merge_models
+
+
+def _model(mid, mets, reactions):
+    m = cobra.Model(mid)
+    m.add_metabolites(mets)
+    add_reactions_from_equations(m, reactions)
+    return m
+
+
+@pytest.fixture
+def model_a():
+    return _model(
+        "A",
+        [
+            cobra.Metabolite("glc_c", name="Glucose", compartment="c"),
+            cobra.Metabolite("g6p_c", name="G6P", compartment="c"),
+        ],
+        [{"id": "HEX", "equation": "glc_c --> g6p_c", "gene_reaction_rule": "GA"}],
+    )
+
+
+@pytest.fixture
+def model_b():
+    # same Glucose[c] compound but a DIFFERENT id
+    return _model(
+        "B",
+        [
+            cobra.Metabolite("glucose_c", name="Glucose", compartment="c"),
+            cobra.Metabolite("lac_c", name="Lactate", compartment="c"),
+        ],
+        [{"id": "LDH", "equation": "glucose_c --> lac_c", "gene_reaction_rule": "GB"}],
+    )
+
+
+def test_unifies_metabolites_by_name_comp(model_a, model_b):
+    merged = merge_models([model_a, model_b])
+    glucoses = [m for m in merged.metabolites if m.name == "Glucose" and m.compartment == "c"]
+    assert len(glucoses) == 1  # glc_c and glucose_c unified
+    # both reactions reference the same merged Glucose object
+    hex_glc = [m for m in merged.reactions.get_by_id("HEX").metabolites if m.name == "Glucose"][0]
+    ldh_glc = [m for m in merged.reactions.get_by_id("LDH").metabolites if m.name == "Glucose"][0]
+    assert hex_glc is ldh_glc
+
+
+def test_match_by_id_keeps_distinct(model_a, model_b):
+    merged = merge_models([model_a, model_b], match_by="id")
+    glucoses = [m for m in merged.metabolites if m.name == "Glucose"]
+    assert len(glucoses) == 2  # glc_c and glucose_c are distinct by id
+
+
+def test_all_reactions_kept(model_a, model_b):
+    merged = merge_models([model_a, model_b])
+    assert {"HEX", "LDH"} <= {r.id for r in merged.reactions}
+
+
+def test_reaction_id_collision_renamed(model_a):
+    # two models with the same reaction id but different chemistry
+    other = _model(
+        "B",
+        [cobra.Metabolite("glc_c", name="Glucose", compartment="c"),
+         cobra.Metabolite("x_c", name="X", compartment="c")],
+        [{"id": "HEX", "equation": "glc_c --> x_c"}],
+    )
+    merged = merge_models([model_a, other])
+    assert "HEX" in {r.id for r in merged.reactions}
+    assert "HEX_B" in {r.id for r in merged.reactions}  # renamed with source id
+
+
+def test_genes_merged(model_a, model_b):
+    merged = merge_models([model_a, model_b])
+    assert {"GA", "GB"} <= {g.id for g in merged.genes}
+
+
+def test_provenance_recorded(model_a, model_b):
+    merged = merge_models([model_a, model_b])
+    assert merged.reactions.get_by_id("HEX").notes["origin"] == "A"
+    assert merged.reactions.get_by_id("LDH").notes["origin"] == "B"
+    assert merged.genes.get_by_id("GA").notes["origin"] == "A"
+
+
+def test_compartments_preserved(model_a):
+    model_a.compartments = {"c": "cytoplasm"}
+    merged = merge_models([model_a, model_a.copy()])
+    assert merged.compartments.get("c") == "cytoplasm"
+
+
+def test_single_model_returns_copy(model_a):
+    merged = merge_models([model_a])
+    assert merged is not model_a
+    assert {r.id for r in merged.reactions} == {r.id for r in model_a.reactions}
+
+
+def test_three_models(model_a, model_b):
+    c = _model("C", [cobra.Metabolite("co2_c", name="CO2", compartment="c")],
+               [{"id": "SINK", "equation": "co2_c -->"}])
+    merged = merge_models([model_a, model_b, c])
+    assert {"HEX", "LDH", "SINK"} <= {r.id for r in merged.reactions}
+
+
+def test_bad_match_by(model_a, model_b):
+    with pytest.raises(ValueError, match="match_by"):
+        merge_models([model_a, model_b], match_by="oops")
+
+
+# --- regression: formula/charge conflict (known_issues.md B1) --------------
+
+def test_formula_conflict_warns():
+    """Two models sharing a name[comp] but with different formulas warn instead
+    of silently keeping the first."""
+    a = _model("A",
+        [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", compartment="c")],
+        [{"id": "EX_A", "equation": "g1 -->"}])
+    b = _model("B",
+        [cobra.Metabolite("g2", name="Glucose", formula="C6H12O7", compartment="c")],
+        [{"id": "EX_B", "equation": "g2 -->"}])
+    with pytest.warns(UserWarning, match="different formulas"):
+        merged = merge_models([a, b])
+    # The merge still picks the first-seen — the test asserts the warning fired
+    # and the model survives.
+    assert "EX_A" in merged.reactions and "EX_B" in merged.reactions
+
+
+def test_charge_conflict_warns():
+    a = _model("A",
+        [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", charge=0, compartment="c")],
+        [{"id": "EX_A", "equation": "g1 -->"}])
+    b = _model("B",
+        [cobra.Metabolite("g2", name="Glucose", formula="C6H12O6", charge=-1, compartment="c")],
+        [{"id": "EX_B", "equation": "g2 -->"}])
+    with pytest.warns(UserWarning, match="different charges"):
+        merge_models([a, b])
diff --git a/tests/test_manipulation_remove.py b/tests/test_manipulation_remove.py
new file mode 100644
index 0000000..2b659b9
--- /dev/null
+++ b/tests/test_manipulation_remove.py
@@ -0,0 +1,97 @@
+"""Tests for raven_python.manipulation.remove (removeMets/removeGenes ports)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import (
+    add_reactions_from_equations,
+    remove_genes,
+    remove_metabolites,
+)
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [
+            cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+            cobra.Metabolite("atp_m", name="ATP", compartment="m"),
+            cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+            cobra.Metabolite("x_c", name="X", compartment="c"),
+        ]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"},
+            {"id": "R2", "equation": "atp_c --> x_c", "gene_reaction_rule": "G3 or G4"},
+            {"id": "R3", "equation": "atp_m --> adp_c"},  # no GPR (spontaneous)
+        ],
+    )
+    return m
+
+
+# --- remove_metabolites ----------------------------------------------------
+
+def test_remove_metabolites_by_id(model):
+    remove_metabolites(model, ["x_c"])
+    assert "x_c" not in model.metabolites
+    # reaction kept, just lost the metabolite
+    assert "R2" in model.reactions
+
+
+def test_remove_metabolites_by_name_across_compartments(model):
+    # "ATP" exists in c and m; by_name removes both at once.
+    remove_metabolites(model, ["ATP"], by_name=True)
+    assert "atp_c" not in model.metabolites
+    assert "atp_m" not in model.metabolites
+    assert "adp_c" in model.metabolites
+
+
+def test_remove_metabolites_destructive(model):
+    remove_metabolites(model, ["adp_c"], destructive=True)
+    # R1 and R3 both produced adp_c -> removed
+    assert "adp_c" not in model.metabolites
+    assert "R1" not in model.reactions and "R3" not in model.reactions
+
+
+# --- remove_genes ----------------------------------------------------------
+
+def test_remove_genes_remove_mode(model):
+    blocked = remove_genes(model, ["G1"], blocked_reactions="remove")
+    # R1 = "G1 and G2": removing G1 breaks the complex -> blocked -> removed
+    assert blocked == ["R1"]
+    assert "R1" not in model.reactions
+    assert "R2" in model.reactions  # OR rule unaffected
+
+
+def test_remove_genes_constrain_mode(model):
+    blocked = remove_genes(model, ["G1"], blocked_reactions="constrain")
+    assert blocked == ["R1"]
+    r1 = model.reactions.get_by_id("R1")
+    assert r1.bounds == (0, 0)  # kept but constrained, per RAVEN default
+    assert r1.gene_reaction_rule == ""
+
+
+def test_remove_genes_keep_mode(model):
+    blocked = remove_genes(model, ["G1"], blocked_reactions="keep")
+    assert blocked == ["R1"]
+    r1 = model.reactions.get_by_id("R1")
+    assert r1.gene_reaction_rule == ""
+    assert r1.bounds != (0, 0)  # left untouched
+
+
+def test_remove_genes_or_rule_not_blocked(model):
+    blocked = remove_genes(model, ["G3"], blocked_reactions="remove")
+    # R2 = "G3 or G4": removing G3 leaves G4 -> not blocked
+    assert blocked == []
+    assert model.reactions.get_by_id("R2").gene_reaction_rule == "G4"
+
+
+def test_remove_genes_absent_gene_is_noop(model):
+    assert remove_genes(model, ["NOPE"]) == []
+
+
+def test_remove_genes_bad_policy(model):
+    with pytest.raises(ValueError, match="blocked_reactions"):
+        remove_genes(model, ["G1"], blocked_reactions="explode")
diff --git a/tests/test_manipulation_simplify.py b/tests/test_manipulation_simplify.py
new file mode 100644
index 0000000..586a0c3
--- /dev/null
+++ b/tests/test_manipulation_simplify.py
@@ -0,0 +1,184 @@
+"""Tests for simplifyModel reduction modes."""
+import cobra
+import pytest
+
+from raven_python.manipulation import (
+    add_reactions_from_equations,
+    constrain_reversible_reactions,
+    group_linear_reactions,
+    remove_dead_end_reactions,
+    remove_duplicate_reactions,
+)
+
+# --- remove_dead_end_reactions --------------------------------------------
+
+def test_dead_end_removed():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "dead")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R_in", "equation": " --> a"},
+            {"id": "R1", "equation": "a --> b"},
+            {"id": "R_out", "equation": "b --> "},
+            {"id": "R_dead", "equation": "a --> dead"},  # 'dead' only produced
+        ],
+    )
+    removed_rxns, removed_mets = remove_dead_end_reactions(m)
+    assert "R_dead" in removed_rxns
+    assert "dead" in removed_mets
+    # the productive path survives
+    assert {"R_in", "R1", "R_out"} <= {r.id for r in m.reactions}
+
+
+def test_dead_end_respects_reserved():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "dead")])
+    add_reactions_from_equations(
+        m, [{"id": "R_in", "equation": " --> a"}, {"id": "R_dead", "equation": "a --> dead"}]
+    )
+    removed_rxns, _ = remove_dead_end_reactions(m, reserved=["R_dead"])
+    assert "R_dead" not in removed_rxns
+    assert "R_dead" in {r.id for r in m.reactions}
+
+
+# --- remove_duplicate_reactions -------------------------------------------
+
+def test_duplicates_removed():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)},
+            {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)},  # duplicate of R1
+            {"id": "R3", "equation": "a --> b", "bounds": (0, 500)},   # different bounds
+        ],
+    )
+    removed = remove_duplicate_reactions(m)
+    assert len(removed) == 1  # one of R1/R2 removed
+    assert {"R3"} <= {r.id for r in m.reactions}
+    assert sum(r.id in ("R1", "R2") for r in m.reactions) == 1
+
+
+def test_duplicates_keep_reserved():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)},
+            {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)},
+        ],
+    )
+    remove_duplicate_reactions(m, reserved=["R1"])
+    assert "R1" in {r.id for r in m.reactions}  # reserved one kept
+
+
+# --- constrain_reversible_reactions ---------------------------------------
+
+def test_forward_only_reversible_constrained():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R_in", "equation": " --> a", "bounds": (0, 1000)},
+            {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)},  # can only go fwd
+            {"id": "R_out", "equation": "b --> ", "bounds": (0, 1000)},
+        ],
+    )
+    changed = constrain_reversible_reactions(m)
+    assert "R1" in changed
+    assert m.reactions.get_by_id("R1").lower_bound == 0  # constrained to forward
+
+
+def test_truly_reversible_unchanged():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R_in", "equation": " <=> a", "bounds": (-1000, 1000)},
+            {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)},
+            {"id": "R_out", "equation": "b <=> ", "bounds": (-1000, 1000)},
+        ],
+    )
+    changed = constrain_reversible_reactions(m)
+    assert "R1" not in changed  # can go both ways
+
+
+# --- group_linear_reactions -----------------------------------------------
+
+def test_linear_chain_merged():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b"},  # b: single producer
+            {"id": "R2", "equation": "b --> c"},  # b: single consumer
+        ],
+    )
+    n_before = len(m.reactions)
+    group_linear_reactions(m)
+    # b is eliminated; R1+R2 merged into one reaction a --> c
+    assert "b" not in m.metabolites
+    assert len(m.reactions) < n_before
+
+
+def test_group_linear_discards_genes():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b", "gene_reaction_rule": "G1"},
+            {"id": "R2", "equation": "b --> c", "gene_reaction_rule": "G2"},
+        ],
+    )
+    group_linear_reactions(m)
+    assert len(m.genes) == 0
+
+
+# --- regression: incremental merge collapses a long chain (known_issues.md D1) ---
+
+def test_group_linear_merges_long_chain_in_one_pass():
+    """The incremental scan still flattens a 5-reaction linear chain — the
+    correctness property the original O(n²·m) restart-after-merge loop had."""
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in "abcdef"])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R_in", "equation": " --> a"},
+            {"id": "R1", "equation": "a --> b"},
+            {"id": "R2", "equation": "b --> c"},
+            {"id": "R3", "equation": "c --> d"},
+            {"id": "R4", "equation": "d --> e"},
+            {"id": "R5", "equation": "e --> f"},
+            {"id": "R_out", "equation": "f --> "},
+        ],
+    )
+    group_linear_reactions(m)
+    # All the chain's internal metabolites are gone.
+    assert {x for x in m.metabolites if x.id in {"b", "c", "d", "e"}} == set()
+
+
+# --- regression: NaN FVA on infeasible model (known_issues.md C1) ----------
+
+def test_constrain_reversible_raises_on_infeasible():
+    """An infeasible model produces NaN FVA ranges; the old abs(NaN) < eps
+    check silently treated those as 'truly reversible'. Now raises."""
+    m = cobra.Model("t")
+    a, b = (cobra.Metabolite(x, compartment="c") for x in ("a", "b"))
+    m.add_metabolites([a, b])
+    # Force a contradiction: r requires production AND consumption of a, but
+    # nothing else produces a.
+    r = cobra.Reaction("r", lower_bound=-1, upper_bound=1)
+    r.add_metabolites({a: -1, b: 1})
+    forced = cobra.Reaction("forced", lower_bound=5, upper_bound=10)  # infeasible
+    forced.add_metabolites({a: -1})
+    m.add_reactions([r, forced])
+    with pytest.raises(RuntimeError, match="infeasible"):
+        constrain_reversible_reactions(m)
diff --git a/tests/test_manipulation_transfer.py b/tests/test_manipulation_transfer.py
new file mode 100644
index 0000000..61c2ac9
--- /dev/null
+++ b/tests/test_manipulation_transfer.py
@@ -0,0 +1,137 @@
+"""Tests for add_reactions_from_model (addRxnsGenesMets port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, add_reactions_from_model
+
+
+@pytest.fixture
+def draft():
+    m = cobra.Model("draft")
+    m.add_metabolites(
+        [cobra.Metabolite("glc_c", name="Glucose", formula="C6H12O6", compartment="c")]
+    )
+    # an existing reaction so glc_c is in use and we have an id to test skipping
+    add_reactions_from_equations(m, [{"id": "R_existing", "equation": "glc_c <=>"}])
+    return m
+
+
+@pytest.fixture
+def source():
+    m = cobra.Model("source")
+    m.add_metabolites(
+        [
+            # same name[comp] as draft's glc_c but a DIFFERENT id
+            cobra.Metabolite("glucose_c", name="Glucose", formula="C6H12O6", compartment="c"),
+            cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+            cobra.Metabolite("g6p_c", name="G6P", formula="C6H13O9P", compartment="c"),
+        ]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {
+                "id": "HEX",
+                "equation": "glucose_c + atp_c --> g6p_c",
+                "name": "hexokinase",
+                "bounds": (0, 1000),
+                "gene_reaction_rule": "G1",
+                "subsystem": "glycolysis",
+            },
+            {"id": "R_existing", "equation": "glucose_c <=>"},  # id already in draft
+        ],
+    )
+    return m
+
+
+def test_metabolite_matched_by_name_comp_not_id(draft, source):
+    add_reactions_from_model(draft, source, "HEX")
+    hex_rxn = draft.reactions.get_by_id("HEX")
+    # Glucose reused from the draft (id glc_c), NOT the source's glucose_c
+    assert "glc_c" in {m.id for m in hex_rxn.metabolites}
+    assert "glucose_c" not in draft.metabolites
+
+
+def test_new_metabolites_added_with_metadata(draft, source):
+    add_reactions_from_model(draft, source, "HEX")
+    assert "atp_c" in draft.metabolites and "g6p_c" in draft.metabolites
+    assert draft.metabolites.get_by_id("g6p_c").formula == "C6H13O9P"
+    assert draft.metabolites.get_by_id("atp_c").charge == -4
+
+
+def test_reaction_copied_with_bounds_and_name(draft, source):
+    (rxn,) = add_reactions_from_model(draft, source, "HEX")
+    assert rxn.id == "HEX"
+    assert rxn.name == "hexokinase"
+    assert rxn.bounds == (0, 1000)
+    assert rxn.subsystem == "glycolysis"
+    assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+        "glc_c": -1.0,
+        "atp_c": -1.0,
+        "g6p_c": 1.0,
+    }
+
+
+def test_genes_true_copies_gpr_and_creates_genes(draft, source):
+    add_reactions_from_model(draft, source, "HEX", genes=True)
+    assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G1"
+    assert "G1" in draft.genes
+
+
+def test_genes_false_no_gpr(draft, source):
+    add_reactions_from_model(draft, source, "HEX", genes=False)
+    assert draft.reactions.get_by_id("HEX").gene_reaction_rule == ""
+
+
+def test_genes_string_override(draft, source):
+    add_reactions_from_model(draft, source, "HEX", genes="G9 or G10")
+    assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G9 or G10"
+
+
+def test_skips_already_present(draft, source):
+    added = add_reactions_from_model(draft, source, ["HEX", "R_existing"])
+    assert [r.id for r in added] == ["HEX"]
+
+
+def test_all_present_raises(draft, source):
+    with pytest.raises(ValueError, match="already in the model"):
+        add_reactions_from_model(draft, source, "R_existing")
+
+
+def test_unknown_source_reaction_raises(draft, source):
+    with pytest.raises(ValueError, match="not found in the source model"):
+        add_reactions_from_model(draft, source, "NOPE")
+
+
+def test_note_and_confidence_stored(draft, source):
+    (rxn,) = add_reactions_from_model(draft, source, "HEX", note="from KEGG", confidence=2)
+    assert rxn.notes["note"] == "from KEGG"
+    assert rxn.notes["confidence_score"] == 2
+
+
+# --- regression: intra-batch met-id minting collision (known_issues.md A3) ---
+
+def test_intra_batch_id_minting_unique():
+    """Two source mets whose ids both collide with the draft and whose name[comp]
+    differs both get routed through new-id minting. The fix tracks ids minted in
+    the current batch so the two don't collapse to the same generated id."""
+    draft = cobra.Model("draft")
+    draft.add_metabolites([
+        cobra.Metabolite("atp_c", name="ATP-draft", compartment="c"),
+        cobra.Metabolite("adp_c", name="ADP-draft", compartment="c"),
+    ])
+    source = cobra.Model("source")
+    source.add_metabolites([
+        cobra.Metabolite("atp_c", name="ATP-source", compartment="c"),
+        cobra.Metabolite("adp_c", name="ADP-source", compartment="c"),
+    ])
+    rxn = cobra.Reaction("R1", lower_bound=0, upper_bound=1000)
+    source.add_reactions([rxn])
+    rxn.add_metabolites({
+        source.metabolites.get_by_id("atp_c"): -1,
+        source.metabolites.get_by_id("adp_c"): 1,
+    })
+    add_reactions_from_model(draft, source, "R1")
+    # Both source mets minted distinct ids (m1 and m2) — not a collision.
+    new_ids = sorted(m.id for m in draft.metabolites if m.id not in ("atp_c", "adp_c"))
+    assert len(new_ids) == 2 and len(set(new_ids)) == 2
diff --git a/tests/test_manipulation_transport.py b/tests/test_manipulation_transport.py
new file mode 100644
index 0000000..e8fb2b6
--- /dev/null
+++ b/tests/test_manipulation_transport.py
@@ -0,0 +1,98 @@
+"""Tests for add_transport_reactions (addTransport port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_transport_reactions
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.compartments = {"c": "cytoplasm", "m": "mitochondrion", "e": "extracellular"}
+    m.add_metabolites(
+        [
+            cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+            cobra.Metabolite("h2o_c", name="H2O", formula="H2O", compartment="c"),
+            cobra.Metabolite("atp_m", name="ATP", compartment="m"),  # exists in m
+        ]
+    )
+    return m
+
+
+def test_basic_transport_to_existing(model):
+    added = add_transport_reactions(model, "c", "m", ["ATP"])
+    assert len(added) == 1
+    rxn = added[0]
+    assert rxn.id == "tr_0001"
+    assert rxn.name == "ATP transport, cytoplasm-mitochondrion"
+    assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+        "atp_c": -1.0,
+        "atp_m": 1.0,
+    }
+    assert rxn.reversibility is True
+
+
+def test_only_to_existing_skips_missing(model):
+    # H2O is not in m; with only_to_existing (default) it's skipped
+    added = add_transport_reactions(model, "c", "m", ["ATP", "H2O"])
+    assert [r.id for r in added] == ["tr_0001"]  # only ATP
+
+
+def test_creates_missing_target_metabolite(model):
+    added = add_transport_reactions(
+        model, "c", "m", ["H2O"], only_to_existing=False
+    )
+    assert len(added) == 1
+    new = [mt for mt in model.metabolites if mt.name == "H2O" and mt.compartment == "m"]
+    assert len(new) == 1
+    assert new[0].formula == "H2O"  # copied from source
+
+
+def test_copies_formula_and_charge(model):
+    add_transport_reactions(model, "c", "e", ["ATP"], only_to_existing=False)
+    new = [mt for mt in model.metabolites if mt.name == "ATP" and mt.compartment == "e"][0]
+    assert new.formula == "C10H16N5O13P3"
+    assert new.charge == -4
+
+
+def test_irreversible(model):
+    (rxn,) = add_transport_reactions(model, "c", "m", ["ATP"], reversible=False)
+    assert rxn.lower_bound == 0
+    assert rxn.reversibility is False
+
+
+def test_default_all_metabolites_in_from(model):
+    # default metabolite_names = all in c (ATP, H2O); to m, only_to_existing -> only ATP
+    added = add_transport_reactions(model, "c", "m")
+    assert [r.id for r in added] == ["tr_0001"]
+
+
+def test_multiple_target_compartments_and_sequential_ids(model):
+    added = add_transport_reactions(
+        model, "c", ["m", "e"], ["ATP"], only_to_existing=False
+    )
+    assert [r.id for r in added] == ["tr_0001", "tr_0002"]
+
+
+def test_unknown_compartment_raises(model):
+    with pytest.raises(ValueError, match="not in the model"):
+        add_transport_reactions(model, "x", "m", ["ATP"])
+
+
+def test_unknown_metabolite_raises(model):
+    with pytest.raises(ValueError, match="not found in compartment"):
+        add_transport_reactions(model, "c", "m", ["NOPE"])
+
+
+# --- regression: duplicate name in compartment (known_issues.md A4) --------
+
+def test_duplicate_name_in_source_compartment_warns(model):
+    """Two source mets sharing a name in the same compartment warn instead of
+    silently collapsing — previously one was dropped from the lookup dict."""
+    model.add_metabolites([
+        cobra.Metabolite("h2o2_c", name="H2O", compartment="c"),  # duplicate name
+    ])
+    with pytest.warns(UserWarning, match="Multiple metabolites named 'H2O'"):
+        added = add_transport_reactions(model, "c", "m", ["H2O"], only_to_existing=False)
+    # Transport still works (uses the first match) — the warning is the signal.
+    assert len(added) == 1
diff --git a/tests/test_parameters.py b/tests/test_parameters.py
new file mode 100644
index 0000000..c0ab06c
--- /dev/null
+++ b/tests/test_parameters.py
@@ -0,0 +1,60 @@
+"""Tests for set_variance_bounds (the var mode of setParam)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, set_variance_bounds
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites(
+        [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")]
+    )
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a_c <=> b_c"},
+            {"id": "R2", "equation": "a_c <=> b_c"},
+        ],
+    )
+    return m
+
+
+def test_band_positive(model):
+    set_variance_bounds(model, "R1", 100, 5)  # 97.5 .. 102.5
+    lb, ub = model.reactions.get_by_id("R1").bounds
+    assert lb == pytest.approx(97.5)
+    assert ub == pytest.approx(102.5)
+
+
+def test_band_negative_is_ordered(model):
+    set_variance_bounds(model, "R1", -100, 5)
+    lb, ub = model.reactions.get_by_id("R1").bounds
+    assert lb == pytest.approx(-102.5)
+    assert ub == pytest.approx(-97.5)
+    assert lb <= ub
+
+
+def test_broadcast_scalar(model):
+    set_variance_bounds(model, ["R1", "R2"], 50, 10)
+    for rid in ("R1", "R2"):
+        lb, ub = model.reactions.get_by_id(rid).bounds
+        assert lb == pytest.approx(47.5)
+        assert ub == pytest.approx(52.5)
+
+
+def test_per_reaction_values(model):
+    set_variance_bounds(model, ["R1", "R2"], [100, 200], 0)
+    assert model.reactions.get_by_id("R1").bounds == pytest.approx((100, 100))
+    assert model.reactions.get_by_id("R2").bounds == pytest.approx((200, 200))
+
+
+def test_length_mismatch_raises(model):
+    with pytest.raises(ValueError, match="to match the reactions"):
+        set_variance_bounds(model, ["R1", "R2"], [1, 2, 3], 5)
+
+
+def test_unknown_reaction_raises(model):
+    with pytest.raises(ValueError, match="not found"):
+        set_variance_bounds(model, "NOPE", 1, 5)

From a1dc557ece5d83d38466e96d676eeec66f90b1bb Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:50:05 +0200
Subject: [PATCH 5/7] Add binary + data resolvers for external tools and
 published artefacts

---
 docs/maintaining_binaries.md     | 236 +++++++++++++++++++++++++++++++
 scripts/make_registry_snippet.py | 102 +++++++++++++
 src/raven_python/binaries.py     | 148 +++++++++++++++++++
 src/raven_python/data.py         | 135 ++++++++++++++++++
 tests/test_binaries.py           |  80 +++++++++++
 tests/test_data.py               |  89 ++++++++++++
 tests/test_scripts_registry.py   |  58 ++++++++
 7 files changed, 848 insertions(+)
 create mode 100644 docs/maintaining_binaries.md
 create mode 100644 scripts/make_registry_snippet.py
 create mode 100644 src/raven_python/binaries.py
 create mode 100644 src/raven_python/data.py
 create mode 100644 tests/test_binaries.py
 create mode 100644 tests/test_data.py
 create mode 100644 tests/test_scripts_registry.py

diff --git a/docs/maintaining_binaries.md b/docs/maintaining_binaries.md
new file mode 100644
index 0000000..df5b315
--- /dev/null
+++ b/docs/maintaining_binaries.md
@@ -0,0 +1,236 @@
+# Maintaining bundled binaries (BLAST+, DIAMOND, …)
+
+Audience: **raven-python maintainers / the GitHub repo owner.** This explains how
+raven-python ships external command-line tools, how to update their versions, and how
+to build **minimal-footprint** ZIPs to attach to a GitHub release.
+
+> End users never read this. They get a binary automatically via `ensure_binary`,
+> or use their own (system/conda) install. This doc is only for whoever publishes
+> the release assets.
+
+---
+
+## 1. How binary provisioning works
+
+raven-python does **not** vendor binaries in the git repo or on PyPI. Instead:
+
+1. For each tool we publish **version-pinned ZIPs as GitHub release assets**.
+2. A **registry** (`src/raven_python/binaries_registry.json`) maps each *bundle* to its
+   version, the executables it provides, and per-platform `{asset, sha256}`.
+3. At run time `raven_python.binaries.ensure_binary("blastp")` resolves a tool in this
+   order — and only reaches the download as a last resort:
+
+   ```
+   explicit binary= arg  →  env var (RAVEN_PYTHON_BLASTP / RAVEN_PYTHON_DIAMOND / …)
+     →  shutil.which on PATH (system / conda / apt / brew)
+     →  ensure_binary: download the pinned ZIP → verify SHA256 → cache → return path
+     →  actionable error (with conda / manual instructions)
+   ```
+
+So a pre-installed binary always wins; the bundle is the zero-setup fallback.
+Pinning the version makes reconstruction **reproducible**.
+
+A *bundle* can provide several executables from one download (e.g. the `blast`
+bundle provides both `blastp` and `makeblastdb`), so they are fetched once.
+
+---
+
+## 2. What raven-python actually needs — ship only these
+
+Distribute the **minimum** set of executables. Everything else (other suite
+tools, docs, examples, changelogs) must be excluded.
+
+| Bundle | Executables to include | Everything else |
+|---|---|---|
+| `diamond` | `diamond` | — (it is a single static binary) |
+| `blast` | `blastp`, `makeblastdb` | **drop** `blastn`, `tblastn`, `psiblast`, `rpsblast`, `blast_formatter`, `*_vdb`, the `doc/`, `ChangeLog`, `README`, ~30 other tools |
+
+(Confirmed against RAVEN `getBlast`/`getDiamond`: only `makeblastdb`+`blastp`, and
+`diamond` for its `makedb`/`blastp` subcommands, are ever invoked.)
+
+For BLAST+ this is the big win: the full NCBI suite is ~hundreds of MB; two
+binaries (stripped) are a small fraction.
+
+---
+
+## 3. Asset & ZIP conventions
+
+**Asset filename:** `<bundle>-<version>-<os>-<arch>.zip`
+
+- `<os>` ∈ `linux`, `macos`, `windows`
+- `<arch>` ∈ `x86_64`, `arm64`
+- examples: `diamond-2.1.11-linux-x86_64.zip`, `blast-2.16.0-macos-arm64.zip`
+
+**ZIP layout — flat, executables at the root, plus the upstream licence:**
+
+```
+diamond-2.1.11-linux-x86_64.zip
+├── diamond
+└── LICENSE
+
+blast-2.16.0-linux-x86_64.zip
+├── blastp
+├── makeblastdb
+└── LICENSE
+```
+
+No nested `bin/`, no extra files. `ensure_binary` extracts the ZIP into the cache
+and expects the executable at the top level.
+
+---
+
+## 4. Step-by-step: add or update a version
+
+Example: bump DIAMOND to a new version for Linux x86-64. Repeat per `(os, arch)`.
+
+1. **Download the official upstream build** (never rebuild from source unless you
+   must):
+   - DIAMOND → <https://github.com/bbuchfink/diamond/releases>
+     (`diamond-linux64.tar.gz`, `diamond-macos.tar.gz`)
+   - BLAST+ → <https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/> or a
+     pinned version dir (`ncbi-blast-<ver>+-x64-linux.tar.gz`,
+     `-x64-macosx.tar.gz`, `-aarch64-linux.tar.gz`, `-x64-win64.tar.gz`).
+   - Record the upstream URL **and** its published checksum for provenance.
+2. **Extract only the needed executables** (see §2) to a clean staging dir.
+3. **Strip debug symbols** to shrink (skip on Windows / signed macOS builds):
+   ```bash
+   strip diamond           # or: strip blastp makeblastdb
+   ```
+4. **Smoke-test the stripped binaries in a clean shell** (no other tools on PATH):
+   ```bash
+   ./diamond --version
+   ./blastp -version && ./makeblastdb -version
+   ```
+   If they fail for a missing shared library, add that `.so`/`.dylib` to the ZIP
+   (rare — NCBI/DIAMOND release builds are largely self-contained).
+5. **Add the upstream licence file** as `LICENSE` (see §6).
+6. **Zip with max compression, flat layout:**
+   ```bash
+   zip -9 -j diamond-2.1.11-linux-x86_64.zip diamond LICENSE
+   # -j junks paths so entries sit at the ZIP root
+   ```
+7. **Compute the SHA256:**
+   ```bash
+   sha256sum diamond-2.1.11-linux-x86_64.zip   # shasum -a 256 on macOS
+   ```
+8. **Attach the ZIP to a raven-python GitHub release** (a release tagged for the binary
+   set, e.g. `binaries-2024.06`, keeps them independent of code releases).
+9. **Update the registry** `src/raven_python/binaries_registry.json` — bump `version`
+   and set the per-platform `asset` + `sha256`:
+   ```json
+   {
+     "diamond": {
+       "version": "2.1.11",
+       "provides": ["diamond"],
+       "platforms": {
+         "linux-x86_64": {
+           "asset": "diamond-2.1.11-linux-x86_64.zip",
+           "url": "https://github.com/SysBioChalmers/raven-python/releases/download/binaries-2024.06/diamond-2.1.11-linux-x86_64.zip",
+           "sha256": "<sha256>"
+         }
+       }
+     },
+     "blast": {
+       "version": "2.16.0",
+       "provides": ["blastp", "makeblastdb"],
+       "platforms": { "linux-x86_64": { "asset": "...", "url": "...", "sha256": "..." } }
+     }
+   }
+   ```
+10. **Commit the registry change**, run the homology tests, and (if you have the
+    binary) confirm `ensure_binary("diamond", version="2.1.11")` downloads,
+    verifies, and runs.
+
+---
+
+## 5. Keeping the footprint minimal — checklist
+
+- ✅ Only the executables in §2 (for BLAST+, exactly `blastp` + `makeblastdb`).
+- ✅ `strip` the binaries (often halves their size).
+- ✅ `zip -9 -j` (max compression, flat — no `bin/`, no folders).
+- ✅ Exactly one extra file: `LICENSE`.
+- ❌ No docs, examples, `ChangeLog`, `README`, man pages, test data, or sibling tools.
+- ❌ No `.dSYM`/debug bundles; no duplicate static `.a` libraries.
+- ➕ Only add a shared library if step-4 testing proves it is required.
+
+---
+
+## 6. Platform / architecture matrix & licensing
+
+**Coverage = what you build.** Start with `linux-x86_64` (CI default), then add
+`macos-arm64`, `macos-x86_64`, `linux-arm64`, `windows-x86_64` as capacity allows.
+For any `(os, arch)` **not** in the registry, `ensure_binary` raises an actionable
+error pointing to conda (`conda install -c bioconda diamond blast`) or a manual
+install — that is the documented fallback, not a failure to fix urgently.
+
+**Licensing (must comply when redistributing):**
+
+- **BLAST+** — produced by NCBI (US Government); **public domain**, free to
+  redistribute. Include NCBI's `LICENSE` for courtesy/provenance.
+- **DIAMOND** — **GPLv3**. Redistribution is allowed; you **must** include the
+  GPLv3 licence text in the ZIP and keep the binary unmodified (or offer source).
+- **HMMER** (future) — BSD-3-Clause; include its `LICENSE`.
+
+Always ship the upstream licence in the ZIP, and keep a `BINARIES_PROVENANCE.md`
+(or a note in the release body) recording, per asset: upstream URL, upstream
+version, upstream checksum, and the SHA256 you published.
+
+### Native OS support per tool
+
+raven-python invokes each tool through `subprocess.run([resolved_path, …])` — that
+call is itself cross-platform, so the real constraint is whether a given tool has
+a binary that runs natively on each OS. It varies:
+
+| Tool | Linux | macOS (incl. arm64) | Windows (native) |
+|---|---|---|---|
+| BLAST+ (`blastp`, `makeblastdb`) | ✅ | ✅ | ✅ (NCBI ships Windows builds) |
+| DIAMOND | ✅ | ✅ | ⚠️ native build exists but Linux-first |
+| HMMER (`hmmbuild`/`hmmpress`/`hmmsearch`/`hmmscan`) | ✅ | ✅ | ❌ no official native build |
+| MAFFT | ✅ | ✅ | ⚠️ Windows package is a wrapper |
+| CD-HIT | ✅ | ✅ | ❌ no Windows build exists |
+
+Implications:
+
+- **Linux / macOS** — everything works. `conda install -c bioconda hmmer mafft
+  cd-hit blast diamond`, or point the `RAVEN_PYTHON_*` env vars at your installs.
+- **Native Windows** — the homology track (BLAST+/DIAMOND) works, but the **KEGG
+  HMM build (3b.3) and HMM query (3b.5) do not**: HMMER and CD-HIT have no Windows
+  binaries, and bioconda has no Windows packages for any of them. Bundling can't
+  fix this — there is no binary to bundle.
+- **Windows users should run raven-python inside WSL2** (or a Linux container), where
+  every tool is native Linux. raven-python does **not** replicate RAVEN's
+  `getWSLpath`/`wsl …` path translation: it calls the resolved binary directly, so
+  mixing native-Windows Python with WSL binaries is unsupported — keep the whole
+  stack inside WSL2.
+- The common end-user paths — homology reconstruction and the KEGG *species* model
+  (3b.4) — need no HMMER/MAFFT/CD-HIT, so they are fully cross-platform.
+
+---
+
+## 7. Emitting the registry entry
+
+After building the per-platform ZIPs (named `<bundle>-<version>-<os>-<arch>.zip`)
+and uploading them to the release, generate the `_REGISTRY` entry — checksums and
+URLs — with [`scripts/make_registry_snippet.py`](../scripts/README.md):
+
+```bash
+python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \
+    --provides blastp makeblastdb --dir zips \
+    --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0
+```
+
+It prints the ready-to-paste `_REGISTRY["blast"]` block; its SHA256 helper is the
+same one `ensure_binary` verifies with, so the checksums always match. (Producing
+the minimal ZIPs themselves — download upstream, `strip`, `zip -9 -j`, add
+`LICENSE` per §3–§6 — is still a manual/per-tool step.)
+
+---
+
+## 8. Adding a new tool later (e.g. HMMER for KEGG reconstruction)
+
+1. Decide the **minimal executable set** (e.g. HMMER → `hmmsearch`, `hmmscan`,
+   maybe `hmmbuild`/`hmmpress`).
+2. Add a bundle entry to the registry with `provides` listing those executables.
+3. Build/attach ZIPs per §3–§4; include the tool's licence (§6).
+4. The wrappers call `ensure_binary("hmmsearch", …)` with the same resolution
+   order — no new provisioning code needed.
diff --git a/scripts/make_registry_snippet.py b/scripts/make_registry_snippet.py
new file mode 100644
index 0000000..3efa49e
--- /dev/null
+++ b/scripts/make_registry_snippet.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+"""Emit ready-to-paste registry entries for published artefacts / binary ZIPs.
+
+Computes the SHA256 of each file and prints the Python/JSON entry to merge into
+``raven_python.data._DATA_REGISTRY`` (data artefacts) or ``raven_python.binaries._REGISTRY``
+(binary bundles). Run once per release, after uploading the files to the release.
+
+Examples
+--------
+Data artefacts (KEGG reference model + tables + HMM libraries) for one release::
+
+    python scripts/make_registry_snippet.py data \\
+        --dataset kegg --version kegg116 --dir artefacts \\
+        --base-url https://github.com/ORG/raven_python/releases/download/kegg-data-kegg116
+
+Binary bundle (one ZIP per platform, named ``<bundle>-<version>-<os>-<arch>.zip``)::
+
+    python scripts/make_registry_snippet.py binary \\
+        --bundle blast --version 2.16.0 --provides blastp makeblastdb --dir zips \\
+        --base-url https://github.com/ORG/raven_python/releases/download/blast-2.16.0
+
+The SHA256 helper is shared with the runtime resolvers (``raven_python.binaries``), so
+published checksums always match what ``ensure_data`` / ``ensure_binary`` verify.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from raven_python.binaries import _sha256
+
+
+def _files_in(directory: Path) -> list[Path]:
+    """Regular, non-hidden files in ``directory``, sorted by name."""
+    return sorted(p for p in directory.iterdir() if p.is_file() and not p.name.startswith("."))
+
+
+def data_entry(dataset: str, version: str, base_url: str, directory: Path) -> dict:
+    """Build the ``_DATA_REGISTRY[dataset]`` entry for every file in ``directory``."""
+    base = base_url.rstrip("/")
+    files = {
+        p.name: {"url": f"{base}/{p.name}", "sha256": _sha256(p)} for p in _files_in(directory)
+    }
+    if not files:
+        raise SystemExit(f"No files found in {directory}")
+    return {"version": version, "files": files}
+
+
+def binary_entry(
+    bundle: str, version: str, provides: list[str], base_url: str, directory: Path
+) -> dict:
+    """Build the ``_REGISTRY[bundle]`` entry from ``<bundle>-<version>-<os>-<arch>.zip``."""
+    base = base_url.rstrip("/")
+    prefix = f"{bundle}-{version}-"
+    platforms = {}
+    for zip_path in directory.glob(f"{prefix}*.zip"):
+        platform = zip_path.name[len(prefix) : -len(".zip")]
+        platforms[platform] = {"url": f"{base}/{zip_path.name}", "sha256": _sha256(zip_path)}
+    if not platforms:
+        raise SystemExit(f"No {prefix}*.zip files found in {directory}")
+    return {"version": version, "provides": provides, "platforms": dict(sorted(platforms.items()))}
+
+
+def render(key: str, entry: dict) -> str:
+    """Render ``{key: entry}`` as an indented JSON block (valid Python to paste)."""
+    return json.dumps({key: entry}, indent=4)
+
+
+def main(argv: list[str] | None = None) -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    sub = parser.add_subparsers(dest="kind", required=True)
+
+    d = sub.add_parser("data", help="data-artefact registry entry (raven_python.data)")
+    d.add_argument("--dataset", required=True, help="dataset key, e.g. 'kegg'")
+    d.add_argument("--version", required=True)
+    d.add_argument("--dir", required=True, type=Path, help="directory of uploaded artefacts")
+    d.add_argument("--base-url", required=True, help="release download URL prefix")
+
+    b = sub.add_parser("binary", help="binary-bundle registry entry (raven_python.binaries)")
+    b.add_argument("--bundle", required=True, help="bundle key, e.g. 'blast'")
+    b.add_argument("--version", required=True)
+    b.add_argument("--provides", nargs="+", required=True, help="executables the bundle provides")
+    b.add_argument("--dir", required=True, type=Path, help="directory of uploaded ZIPs")
+    b.add_argument("--base-url", required=True, help="release download URL prefix")
+
+    args = parser.parse_args(argv)
+    if args.kind == "data":
+        key, entry = args.dataset, data_entry(args.dataset, args.version, args.base_url, args.dir)
+        target = "raven_python/data.py  _DATA_REGISTRY"
+    else:
+        key = args.bundle
+        entry = binary_entry(args.bundle, args.version, args.provides, args.base_url, args.dir)
+        target = "raven_python/binaries.py  _REGISTRY"
+
+    print(f"# Merge into {target}:", file=sys.stderr)
+    print(render(key, entry))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py
new file mode 100644
index 0000000..2d4b5a2
--- /dev/null
+++ b/src/raven_python/binaries.py
@@ -0,0 +1,148 @@
+"""Locate and provision external command-line binaries (BLAST+, DIAMOND, …).
+
+Shared across tools (not homology-specific). Resolution order for any executable:
+
+    explicit path arg  →  env var (RAVEN_PYTHON_<TOOL>)  →  shutil.which (PATH)
+      →  ensure_binary  (download the version-pinned ZIP from a raven_python release,
+                         verify SHA256, cache, return the path)
+      →  FileNotFoundError with install guidance
+
+So a pre-installed/conda binary always wins; the bundled ZIP is the zero-setup
+fallback. See docs/maintaining_binaries.md for how the release ZIPs and the
+registry are produced and updated.
+"""
+from __future__ import annotations
+
+import hashlib
+import os
+import platform
+import shutil
+import zipfile
+from pathlib import Path
+from urllib.request import urlopen
+
+# Registry of bundled binaries. Empty until release ZIPs are published; populated
+# per docs/maintaining_binaries.md. Keyed by *bundle*; one bundle can provide
+# several executables (e.g. "blast" -> blastp + makeblastdb).
+#   bundle -> {version, provides:[exe...], platforms:{"<os>-<arch>": {url, sha256}}}
+_REGISTRY: dict = {}
+
+# Environment variable overrides per executable.
+_ENV_VARS = {
+    "diamond": "RAVEN_PYTHON_DIAMOND",
+    "blastp": "RAVEN_PYTHON_BLASTP",
+    "makeblastdb": "RAVEN_PYTHON_MAKEBLASTDB",
+    "hmmbuild": "RAVEN_PYTHON_HMMBUILD",
+    "hmmpress": "RAVEN_PYTHON_HMMPRESS",
+    "hmmsearch": "RAVEN_PYTHON_HMMSEARCH",
+    "hmmscan": "RAVEN_PYTHON_HMMSCAN",
+    "mafft": "RAVEN_PYTHON_MAFFT",
+    "cd-hit": "RAVEN_PYTHON_CDHIT",
+}
+
+
+def platform_key() -> str:
+    """Return the ``<os>-<arch>`` key used in the registry (e.g. ``linux-x86_64``)."""
+    system = {"linux": "linux", "darwin": "macos", "windows": "windows"}.get(
+        platform.system().lower(), platform.system().lower()
+    )
+    machine = platform.machine().lower()
+    arch = {"x86_64": "x86_64", "amd64": "x86_64", "arm64": "arm64", "aarch64": "arm64"}.get(
+        machine, machine
+    )
+    return f"{system}-{arch}"
+
+
+def _cache_dir() -> Path:
+    base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")
+    return Path(base) / "raven_python" / "binaries"
+
+
+def _bundle_for(executable: str, registry: dict):
+    for name, bundle in registry.items():
+        if executable in bundle.get("provides", []):
+            return name, bundle
+    return None, None
+
+
+def _sha256(path: Path) -> str:
+    h = hashlib.sha256()
+    with open(path, "rb") as fh:
+        for chunk in iter(lambda: fh.read(1 << 20), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def ensure_binary(executable: str, *, registry: dict | None = None) -> Path:
+    """Download (if needed) and return the path to a bundled ``executable``.
+
+    Consults the registry for the current platform, downloads the pinned ZIP,
+    verifies its SHA256, extracts it into the cache, and returns the executable
+    path. Raises ``FileNotFoundError`` if no bundle for this platform is hosted.
+    """
+    registry = _REGISTRY if registry is None else registry
+    bundle_name, bundle = _bundle_for(executable, registry)
+    if bundle is None:
+        raise FileNotFoundError(
+            f"No bundled binary registered for {executable!r}. Install it (e.g. "
+            f"`conda install -c bioconda {executable}`) or pass an explicit path."
+        )
+    key = platform_key()
+    entry = bundle.get("platforms", {}).get(key)
+    if entry is None:
+        raise FileNotFoundError(
+            f"No bundled {executable!r} for platform {key!r}. Install it "
+            f"(e.g. `conda install -c bioconda {executable}`), set "
+            f"{_ENV_VARS.get(executable, 'the binary path')}, or pass binary=."
+        )
+
+    dest_dir = _cache_dir() / f"{bundle_name}-{bundle['version']}-{key}"
+    exe = dest_dir / executable
+    if exe.exists():
+        return exe
+
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    archive = dest_dir / "_download.zip"
+    # Download into a sibling .part file and rename on success — an interrupted
+    # download leaves the partial behind .part, never as a half-complete .zip
+    # that a later run might mistake for a finished one. Mirrors data.py.
+    part = archive.with_suffix(archive.suffix + ".part")
+    try:
+        with urlopen(entry["url"]) as resp, open(part, "wb") as out:  # noqa: S310
+            shutil.copyfileobj(resp, out)
+        digest = _sha256(part)
+        if digest != entry["sha256"]:
+            raise ValueError(
+                f"SHA256 mismatch for {executable!r} ({key}): "
+                f"expected {entry['sha256']}, got {digest}."
+            )
+        os.replace(part, archive)
+    finally:
+        part.unlink(missing_ok=True)
+    with zipfile.ZipFile(archive) as zf:
+        zf.extractall(dest_dir)
+    archive.unlink(missing_ok=True)
+    if not exe.exists():
+        raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.")
+    exe.chmod(0o755)
+    return exe
+
+
+def resolve_binary(executable: str, *, binary: str | os.PathLike | None = None) -> str:
+    """Resolve an executable to a path: arg → env var → PATH → bundled ZIP → error."""
+    if binary is not None:
+        return os.fspath(binary)
+    env_var = _ENV_VARS.get(executable)
+    if env_var and os.environ.get(env_var):
+        return os.environ[env_var]
+    found = shutil.which(executable)
+    if found:
+        return found
+    try:
+        return os.fspath(ensure_binary(executable))
+    except FileNotFoundError as exc:
+        raise FileNotFoundError(
+            f"Could not find {executable!r}. Install it (e.g. "
+            f"`conda install -c bioconda {executable}`), put it on PATH, set "
+            f"{env_var or 'the binary path'}, or pass binary=. ({exc})"
+        ) from exc
diff --git a/src/raven_python/data.py b/src/raven_python/data.py
new file mode 100644
index 0000000..b1264be
--- /dev/null
+++ b/src/raven_python/data.py
@@ -0,0 +1,135 @@
+"""Fetch and cache published data artefacts (KEGG reference model, tables, HMMs).
+
+The mirror of :mod:`raven_python.binaries` for *data*: a version-pinned registry of
+downloadable artefacts, fetched on first use, SHA256-verified, and cached under
+platformdirs so end users never rebuild them from a KEGG dump (that is the
+maintainer's job — see docs/maintaining_kegg_data.md).
+
+Resolution for any artefact file:
+
+    explicit local dir  →  cached copy  →  download from the registry (verify,
+        cache)  →  FileNotFoundError with guidance
+
+The registry is **empty until the artefacts are published** (same as
+``binaries._REGISTRY``); until then ``ensure_data_file`` raises an actionable
+error. Cache layout::
+
+    $XDG_CACHE_HOME/raven_python/data/<dataset>-<version>/<filename>
+    (or ~/.cache/raven_python/data/... if XDG_CACHE_HOME is unset)
+"""
+from __future__ import annotations
+
+import os
+import shutil
+from pathlib import Path
+from urllib.request import urlopen
+
+from raven_python.binaries import _sha256
+
+# dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}}
+# Populated when raven_python publishes the KEGG artefacts as release assets.
+_DATA_REGISTRY: dict = {}
+
+# The core KEGG artefacts needed to build a model (no HMM libraries).
+CORE_KEGG_FILES = (
+    "reference_model.yml.gz",
+    "ko_reaction.tsv.gz",
+    "ko_names.tsv.gz",
+    "organism_gene_ko.tsv.xz",
+    "rxn_flags.tsv.gz",
+)
+
+
+def _data_cache_dir() -> Path:
+    base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")
+    return Path(base) / "raven_python" / "data"
+
+
+def _bundle(dataset: str, registry: dict) -> dict:
+    bundle = registry.get(dataset)
+    if bundle is None:
+        raise FileNotFoundError(
+            f"No data artefacts registered for {dataset!r}. Either pass a local "
+            f"directory of artefacts, or build them per docs/maintaining_kegg_data.md."
+        )
+    return bundle
+
+
+def ensure_data_file(
+    dataset: str,
+    filename: str,
+    *,
+    version: str | None = None,
+    registry: dict | None = None,
+) -> Path:
+    """Download (if needed) and return the cached path to one artefact file.
+
+    Looks the file up in the registry for ``dataset`` (at ``version`` or the
+    registry's default), downloads it to the version-pinned cache directory,
+    verifies its SHA256, and returns the path. Re-uses an already-cached copy.
+    """
+    registry = _DATA_REGISTRY if registry is None else registry
+    bundle = _bundle(dataset, registry)
+    ver = version or bundle["version"]
+    entry = bundle.get("files", {}).get(filename)
+    if entry is None:
+        raise FileNotFoundError(
+            f"{filename!r} is not registered for {dataset!r} {ver}. "
+            f"Available: {sorted(bundle.get('files', {}))}."
+        )
+
+    dest_dir = _data_cache_dir() / f"{dataset}-{ver}"
+    dest = dest_dir / filename
+    if dest.exists():
+        return dest
+
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    tmp = dest.with_name(dest.name + ".part")
+    with urlopen(entry["url"]) as resp, open(tmp, "wb") as out:  # noqa: S310 (trusted registry URLs)
+        shutil.copyfileobj(resp, out)
+    digest = _sha256(tmp)
+    if digest != entry["sha256"]:
+        tmp.unlink(missing_ok=True)
+        raise ValueError(
+            f"SHA256 mismatch for {dataset}/{filename} ({ver}): "
+            f"expected {entry['sha256']}, got {digest}."
+        )
+    tmp.replace(dest)
+    return dest
+
+
+def ensure_kegg_data(
+    *,
+    version: str | None = None,
+    files: tuple[str, ...] = CORE_KEGG_FILES,
+    registry: dict | None = None,
+) -> Path:
+    """Ensure the core KEGG artefacts are cached; return their directory.
+
+    Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg``
+    dataset and returns the cache directory holding them — ready to pass as the
+    ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`.
+    """
+    registry = _DATA_REGISTRY if registry is None else registry
+    ver = version or _bundle("kegg", registry)["version"]
+    for filename in files:
+        ensure_data_file("kegg", filename, version=ver, registry=registry)
+    return _data_cache_dir() / f"kegg-{ver}"
+
+
+def ensure_kegg_hmm_library(
+    domain: str, *, version: str | None = None, registry: dict | None = None
+) -> Path:
+    """Ensure a domain HMM library (and its hmmpress index) is cached; return its path.
+
+    ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``<domain>.hmm``
+    plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the
+    path to the ``.hmm`` (the argument for :func:`run_hmmscan`).
+    """
+    registry = _DATA_REGISTRY if registry is None else registry
+    ver = version or _bundle("kegg", registry)["version"]
+    base = f"{domain}.hmm"
+    library = ensure_data_file("kegg", base, version=ver, registry=registry)
+    for suffix in (".h3f", ".h3i", ".h3m", ".h3p"):
+        ensure_data_file("kegg", base + suffix, version=ver, registry=registry)
+    return library
diff --git a/tests/test_binaries.py b/tests/test_binaries.py
new file mode 100644
index 0000000..d74ce0b
--- /dev/null
+++ b/tests/test_binaries.py
@@ -0,0 +1,80 @@
+"""Tests for raven_python.binaries (binary resolution + bundled-ZIP provisioning)."""
+import hashlib
+import shutil
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from raven_python import binaries
+
+
+def test_resolve_explicit_path():
+    assert binaries.resolve_binary("blastp", binary="/opt/x/blastp") == "/opt/x/blastp"
+
+
+def test_resolve_env_var(monkeypatch):
+    monkeypatch.setenv("RAVEN_PYTHON_DIAMOND", "/custom/diamond")
+    assert binaries.resolve_binary("diamond") == "/custom/diamond"
+
+
+@pytest.mark.skipif(not shutil.which("blastp"), reason="blastp not installed")
+def test_resolve_via_path():
+    assert binaries.resolve_binary("blastp") == shutil.which("blastp")
+
+
+def test_resolve_unresolvable_raises(monkeypatch):
+    monkeypatch.setattr(shutil, "which", lambda _: None)
+    with pytest.raises(FileNotFoundError, match="Could not find"):
+        binaries.resolve_binary("diamond")  # empty registry, not on PATH
+
+
+def test_platform_key_format():
+    key = binaries.platform_key()
+    assert "-" in key
+    os_part, arch = key.split("-", 1)
+    assert os_part in {"linux", "macos", "windows"} or os_part  # tolerant
+
+
+def test_ensure_binary_downloads_verifies_extracts(tmp_path, monkeypatch):
+    # Build a fake bundle ZIP containing an executable, served via file:// URL.
+    exe = tmp_path / "footool"
+    exe.write_text("#!/bin/sh\necho hi\n")
+    archive = tmp_path / "footool.zip"
+    with zipfile.ZipFile(archive, "w") as zf:
+        zf.write(exe, "footool")
+    sha = hashlib.sha256(archive.read_bytes()).hexdigest()
+
+    registry = {
+        "footool": {
+            "version": "1.0",
+            "provides": ["footool"],
+            "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": sha}},
+        }
+    }
+    monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache"))
+
+    path = binaries.ensure_binary("footool", registry=registry)
+    assert Path(path).exists()
+    assert Path(path).name == "footool"
+    # cached on second call (same path, no re-download needed)
+    assert binaries.ensure_binary("footool", registry=registry) == path
+
+
+def test_ensure_binary_sha_mismatch(tmp_path, monkeypatch):
+    archive = tmp_path / "x.zip"
+    with zipfile.ZipFile(archive, "w") as zf:
+        zf.writestr("footool", "data")
+    registry = {
+        "footool": {"version": "1", "provides": ["footool"],
+                    "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": "deadbeef"}}}
+    }
+    monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache"))
+    with pytest.raises(ValueError, match="SHA256 mismatch"):
+        binaries.ensure_binary("footool", registry=registry)
+
+
+def test_ensure_binary_unhosted_platform_raises(tmp_path):
+    registry = {"footool": {"version": "1", "provides": ["footool"], "platforms": {}}}
+    with pytest.raises(FileNotFoundError, match="No bundled"):
+        binaries.ensure_binary("footool", registry=registry)
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..714c3a9
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,89 @@
+"""Tests for ensure_data (data.py). Uses file:// URLs to avoid the network."""
+import hashlib
+
+import pytest
+
+from raven_python.data import (
+    CORE_KEGG_FILES,
+    ensure_data_file,
+    ensure_kegg_data,
+)
+
+
+def _sha256(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
+@pytest.fixture
+def served(tmp_path, monkeypatch):
+    """A fake registry served from local files, with the cache pointed at tmp."""
+    src = tmp_path / "src"
+    src.mkdir()
+    payloads = {
+        "reference_model.yml.gz": b"!!omap model bytes",
+        "ko_reaction.tsv.gz": b"ko\treaction\n",
+        "ko_names.tsv.gz": b"ko\tname\n",
+        "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n",
+        "rxn_flags.tsv.gz": b"reaction\tspontaneous\n",
+    }
+    files = {}
+    for name, data in payloads.items():
+        path = src / name
+        path.write_bytes(data)
+        files[name] = {"url": path.as_uri(), "sha256": _sha256(data)}
+    registry = {"kegg": {"version": "v1", "files": files}}
+
+    cache = tmp_path / "cache"
+    monkeypatch.setenv("XDG_CACHE_HOME", str(cache))
+    return registry, cache, payloads
+
+
+def test_ensure_data_file_downloads_and_caches(served):
+    registry, cache, payloads = served
+    path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry)
+    assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz"
+    assert path.read_bytes() == payloads["ko_reaction.tsv.gz"]
+
+
+def test_ensure_data_file_reuses_cache(served, monkeypatch):
+    registry, _, _ = served
+    first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+    # Break the URL: a second call must hit the cache, not re-download.
+    registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent"
+    second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+    assert first == second and second.exists()
+
+
+def test_sha256_mismatch_rejected(served):
+    registry, cache, _ = served
+    registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64
+    with pytest.raises(ValueError, match="SHA256 mismatch"):
+        ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry)
+    # The corrupt partial download must not be left behind.
+    assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists()
+
+
+def test_unknown_dataset_actionable_error(served):
+    registry, _, _ = served
+    with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
+        ensure_data_file("metacyc", "x", registry=registry)
+
+
+def test_unknown_file_lists_available(served):
+    registry, _, _ = served
+    with pytest.raises(FileNotFoundError, match="not registered"):
+        ensure_data_file("kegg", "missing.tsv.gz", registry=registry)
+
+
+def test_ensure_kegg_data_fetches_core_set(served):
+    registry, cache, _ = served
+    out = ensure_kegg_data(registry=registry)
+    assert out == cache / "raven_python" / "data" / "kegg-v1"
+    for name in CORE_KEGG_FILES:
+        assert (out / name).is_file()
+
+
+def test_empty_registry_raises():
+    # The shipped registry is empty until artefacts are published.
+    with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
+        ensure_data_file("kegg", "ko_reaction.tsv.gz")
diff --git a/tests/test_scripts_registry.py b/tests/test_scripts_registry.py
new file mode 100644
index 0000000..c9c03cf
--- /dev/null
+++ b/tests/test_scripts_registry.py
@@ -0,0 +1,58 @@
+"""Tests for scripts/make_registry_snippet.py registry-entry helpers."""
+import hashlib
+import importlib.util
+import json
+from pathlib import Path
+
+import pytest
+
+# scripts/ is not a package; load the module directly by path.
+_SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "make_registry_snippet.py"
+_spec = importlib.util.spec_from_file_location("make_registry_snippet", _SCRIPT)
+mrs = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(mrs)
+
+
+def _sha(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
+def test_data_entry_lists_files_with_urls_and_checksums(tmp_path):
+    (tmp_path / "reference_model.yml.gz").write_bytes(b"model")
+    (tmp_path / "ko_reaction.tsv.gz").write_bytes(b"table")
+    (tmp_path / ".hidden").write_bytes(b"skip")  # hidden files ignored
+
+    entry = mrs.data_entry("kegg", "kegg116", "https://x/rel/", tmp_path)
+    assert entry["version"] == "kegg116"
+    assert set(entry["files"]) == {"reference_model.yml.gz", "ko_reaction.tsv.gz"}
+    ref = entry["files"]["reference_model.yml.gz"]
+    assert ref["url"] == "https://x/rel/reference_model.yml.gz"  # trailing slash collapsed
+    assert ref["sha256"] == _sha(b"model")
+
+
+def test_data_entry_empty_dir_errors(tmp_path):
+    with pytest.raises(SystemExit):
+        mrs.data_entry("kegg", "v1", "https://x", tmp_path)
+
+
+def test_binary_entry_parses_platform_from_filename(tmp_path):
+    (tmp_path / "blast-2.16.0-linux-x86_64.zip").write_bytes(b"linux")
+    (tmp_path / "blast-2.16.0-macos-arm64.zip").write_bytes(b"mac")
+    (tmp_path / "other-1.0-linux-x86_64.zip").write_bytes(b"nope")  # different bundle
+
+    entry = mrs.binary_entry("blast", "2.16.0", ["blastp", "makeblastdb"], "https://x", tmp_path)
+    assert entry["provides"] == ["blastp", "makeblastdb"]
+    assert set(entry["platforms"]) == {"linux-x86_64", "macos-arm64"}
+    assert entry["platforms"]["macos-arm64"]["sha256"] == _sha(b"mac")
+    assert entry["platforms"]["linux-x86_64"]["url"].endswith("blast-2.16.0-linux-x86_64.zip")
+
+
+def test_binary_entry_no_zips_errors(tmp_path):
+    with pytest.raises(SystemExit):
+        mrs.binary_entry("blast", "2.16.0", ["blastp"], "https://x", tmp_path)
+
+
+def test_render_is_valid_json_round_trip():
+    entry = {"version": "v1", "files": {"a": {"url": "u", "sha256": "s"}}}
+    text = mrs.render("kegg", entry)
+    assert json.loads(text) == {"kegg": entry}

From 6ef3357aeee811cde778174aa98887327f1e62f7 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:50:36 +0200
Subject: [PATCH 6/7] Add YAML and SIF model I/O

---
 src/raven_python/io/__init__.py |  15 +++
 src/raven_python/io/sif.py      |  96 +++++++++++++++
 src/raven_python/io/yaml.py     | 191 ++++++++++++++++++++++++++++++
 tests/test_io_sif.py            |  82 +++++++++++++
 tests/test_io_yaml.py           | 202 ++++++++++++++++++++++++++++++++
 5 files changed, 586 insertions(+)
 create mode 100644 src/raven_python/io/__init__.py
 create mode 100644 src/raven_python/io/sif.py
 create mode 100644 src/raven_python/io/yaml.py
 create mode 100644 tests/test_io_sif.py
 create mode 100644 tests/test_io_yaml.py

diff --git a/src/raven_python/io/__init__.py b/src/raven_python/io/__init__.py
new file mode 100644
index 0000000..bc70511
--- /dev/null
+++ b/src/raven_python/io/__init__.py
@@ -0,0 +1,15 @@
+"""RAVEN-specific I/O: YAML (cobra + Metabolic Atlas / Human-GEM extensions), SIF,
+Excel export, and the Standard-GEM ``model/<fmt>/…`` git layout.
+"""
+from raven_python.io.excel import export_to_excel
+from raven_python.io.git import export_for_git
+from raven_python.io.sif import export_model_to_sif
+from raven_python.io.yaml import read_yaml_model, write_yaml_model
+
+__all__ = [
+    "export_for_git",
+    "export_model_to_sif",
+    "export_to_excel",
+    "read_yaml_model",
+    "write_yaml_model",
+]
diff --git a/src/raven_python/io/sif.py b/src/raven_python/io/sif.py
new file mode 100644
index 0000000..9e73efa
--- /dev/null
+++ b/src/raven_python/io/sif.py
@@ -0,0 +1,96 @@
+"""Export a model to Cytoscape SIF (Simple Interaction Format).
+
+Three graph types are supported:
+
+* ``"rc"`` reaction–compound: each reaction linked to its metabolites;
+* ``"rr"`` reaction–reaction: reactions linked when they share a metabolite;
+* ``"cc"`` compound–compound: each substrate linked to the products of the
+  reactions it feeds (computed on an irreversible copy, as RAVEN does, to avoid
+  spurious double links from reversible reactions).
+
+A SIF line is ``source <tab> graph_type <tab> target1 <tab> target2 ...``.
+"""
+from __future__ import annotations
+
+import warnings
+from collections import Counter
+from collections.abc import Mapping
+from pathlib import Path
+
+import cobra
+
+from raven_python.manipulation.irreversible import convert_to_irreversible
+
+_GRAPH_TYPES = ("rc", "rr", "cc")
+
+
+def _edges(model, graph_type):
+    """Yield (source_object, [target_objects]) per the graph type."""
+    if graph_type == "rc":
+        for rxn in model.reactions:
+            yield rxn, list(rxn.metabolites)
+    elif graph_type == "rr":
+        for rxn in model.reactions:
+            neighbours = {r for met in rxn.metabolites for r in met.reactions}
+            neighbours.discard(rxn)
+            yield rxn, list(neighbours)
+    else:  # cc — on an irreversible copy
+        irrev = model.copy()
+        convert_to_irreversible(irrev)
+        for met in irrev.metabolites:
+            products: set = set()
+            for rxn in met.reactions:
+                if rxn.get_coefficient(met) < 0:  # met is a substrate here
+                    products.update(m for m, c in rxn.metabolites.items() if c > 0)
+            yield met, list(products)
+
+
+def export_model_to_sif(
+    model: cobra.Model,
+    path: str | Path,
+    graph_type: str = "rc",
+    *,
+    reaction_labels: Mapping[str, str] | None = None,
+    metabolite_labels: Mapping[str, str] | None = None,
+) -> None:
+    """Write ``model`` to a Cytoscape SIF file.
+
+    Parameters
+    ----------
+    graph_type
+        ``"rc"`` (reaction–compound, default), ``"rr"`` (reaction–reaction), or
+        ``"cc"`` (compound–compound).
+    reaction_labels, metabolite_labels
+        Optional ``{id: label}`` maps overriding the node labels (default: IDs).
+    """
+    if graph_type not in _GRAPH_TYPES:
+        raise ValueError(f"graph_type must be one of {_GRAPH_TYPES}, got {graph_type!r}")
+
+    rlabels = reaction_labels or {}
+    mlabels = metabolite_labels or {}
+
+    # Warn when the label maps collapse multiple distinct ids onto the same
+    # label: target-side dedup runs on labels, so the collision silently merges
+    # two nodes into one edge. Only check the ids actually mapped (cobra default
+    # labels are ids, which can't collide).
+    for kind, lmap in (("reaction", rlabels), ("metabolite", mlabels)):
+        duplicates = [lab for lab, n in Counter(lmap.values()).items() if n > 1]
+        if duplicates:
+            warnings.warn(
+                f"{kind}_labels maps multiple ids to the same label(s) "
+                f"({duplicates[:5]}{'…' if len(duplicates) > 5 else ''}); "
+                "SIF nodes are keyed by label, so those nodes will collapse.",
+                stacklevel=2,
+            )
+
+    def label(obj) -> str:
+        if isinstance(obj, cobra.Reaction):
+            return rlabels.get(obj.id, obj.id)
+        return mlabels.get(obj.id, obj.id)
+
+    with open(path, "w", encoding="utf-8") as handle:
+        for source, targets in _edges(model, graph_type):
+            src = label(source)
+            names = sorted({label(t) for t in targets} - {src})
+            if names:
+                handle.write(f"{src}\t{graph_type}\t" + "\t".join(names) + "\n")
diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py
new file mode 100644
index 0000000..151954b
--- /dev/null
+++ b/src/raven_python/io/yaml.py
@@ -0,0 +1,191 @@
+"""Read and write RAVEN/cobrapy YAML models.
+
+Aligned to RAVEN ``writeYAMLmodel.m`` / ``readYAMLmodel.m`` as of the
+``feat/geckopy-compat-yaml`` work (commit fa281a1), whose writer emits **cobra's
+native ``!!omap`` YAML**. Because the format *is* cobra's, the standard model
+content — id, name, compartments, and per-entry id/name/compartment/formula/
+charge/bounds/gene_reaction_rule/objective_coefficient/subsystem/metabolites and
+the whole ``annotation`` block (which carries ``smiles`` for metabolites,
+``ec-code`` for reactions, and all MIRIAM cross-references) — is read and written
+by ``cobra.io`` directly.
+
+This module only handles what cobra drops or mishandles:
+
+* **RAVEN-only top-level per-entry keys** that cobra ignores: ``inchis``,
+  ``deltaG``, ``metFrom`` and the free-text ``notes`` (metNotes) on metabolites;
+  ``confidence_score``, ``references``, ``rxnFrom``, ``deltaG`` and ``notes``
+  (rxnNotes) on reactions; ``protein`` on genes. These are stashed in the cobra
+  object's ``.notes`` dict on read and lifted back to top-level keys on write.
+* **Model-level extras** cobra ignores: ``version``, the ``metaData`` provenance
+  block, and the GECKO sections (``gecko_light``/``ec-rxns``/``ec-enzymes``),
+  preserved on ``model.notes`` for round-tripping.
+
+The reader also accepts the older RAVEN files (id/name nested in ``metaData``).
+"""
+from __future__ import annotations
+
+import gzip
+from collections import OrderedDict
+from pathlib import Path
+
+import cobra
+from cobra.io.dict import model_from_dict, model_to_dict
+from cobra.io.yaml import yaml as _cobra_yaml  # ruamel round-trip YAML (handles !!omap)
+
+
+def _open_text(path: str | Path, mode: str):
+    """Open ``path`` as a text handle, transparently gzipping when it ends ``.gz``."""
+    if str(path).endswith(".gz"):
+        return gzip.open(path, f"{mode}t", encoding="utf-8")
+    return open(path, mode, encoding="utf-8")
+
+# RAVEN-only top-level per-entry keys -> the key used inside the cobra object's
+# .notes dict. ('notes' is RAVEN's free-text metNotes/rxnNotes; stored under
+# 'note' to avoid colliding with the notes container itself.)
+_MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note"))
+_RXN_FIELDS = (
+    ("confidence_score", "confidence_score"),
+    ("references", "references"),
+    ("rxnFrom", "rxnFrom"),
+    ("deltaG", "deltaG"),
+    ("notes", "note"),
+)
+_GENE_FIELDS = (("protein", "protein"),)
+
+_COBRA_TOP_KEYS = frozenset({"metabolites", "reactions", "genes", "compartments", "id", "name"})
+
+
+def _to_plain(obj):
+    if isinstance(obj, dict):
+        return {str(k): _to_plain(v) for k, v in obj.items()}
+    if isinstance(obj, (list, tuple)):
+        return [_to_plain(v) for v in obj]
+    if isinstance(obj, bool) or obj is None:
+        return obj
+    if isinstance(obj, int):
+        return int(obj)
+    if isinstance(obj, float):
+        return float(obj)
+    return obj if isinstance(obj, str) else str(obj)
+
+
+def _capture_entry_fields(entries, fields):
+    """Pop RAVEN-only top-level keys off each entry into a parallel notes dict.
+
+    Returns a list of ``{notes_key: value}`` dicts aligned with ``entries`` (so
+    cobra never sees these keys), to be attached to the built objects afterwards.
+    """
+    captured = []
+    for entry in entries:
+        notes = {}
+        for yaml_key, notes_key in fields:
+            if yaml_key in entry:
+                notes[notes_key] = entry.pop(yaml_key)
+        captured.append(notes)
+    return captured
+
+
+def read_yaml_model(path: str | Path) -> cobra.Model:
+    """Read a RAVEN/cobrapy YAML model into a ``cobra.Model``."""
+    with _open_text(path, "r") as handle:
+        raw = _to_plain(_cobra_yaml.load(handle))
+
+    if not isinstance(raw, dict):
+        raise ValueError(f"{path}: top-level YAML is a {type(raw).__name__}, not a mapping.")
+
+    metadata = raw.pop("metaData", None) or {}
+    version = raw.pop("version", None)
+    foreign = {k: raw.pop(k) for k in list(raw) if k not in _COBRA_TOP_KEYS}
+
+    met_notes = _capture_entry_fields(raw.get("metabolites", []), _MET_FIELDS)
+    rxn_notes = _capture_entry_fields(raw.get("reactions", []), _RXN_FIELDS)
+    gene_notes = _capture_entry_fields(raw.get("genes", []), _GENE_FIELDS)
+
+    model = model_from_dict(raw)
+
+    for met, notes in zip(model.metabolites, met_notes, strict=False):
+        met.notes = notes
+    for rxn, notes in zip(model.reactions, rxn_notes, strict=False):
+        rxn.notes = notes
+    for gene, notes in zip(model.genes, gene_notes, strict=False):
+        gene.notes = notes
+
+    # Legacy files keep id/name inside metaData; restore them if cobra found none.
+    if metadata.get("id") and not model.id:
+        model.id = metadata["id"]
+    if metadata.get("name") and not model.name:
+        model.name = metadata["name"]
+    if metadata:
+        model.notes["metaData"] = metadata
+    if version is not None:
+        model.notes["version"] = version
+    if foreign:
+        model.notes["_yaml_sections"] = foreign
+
+    return model
+
+
+def _emit_entry_fields(entries, fields):
+    """Lift RAVEN-only keys out of each entry's ``notes`` dict to top level."""
+    for entry in entries:
+        notes = entry.pop("notes", None)
+        if not isinstance(notes, dict):
+            continue
+        notes = dict(notes)
+        for yaml_key, notes_key in fields:
+            if notes_key in notes:
+                entry[yaml_key] = notes.pop(notes_key)
+        # Preserve any remaining (non-RAVEN) notes. The RAVEN free-text note is lifted
+        # to the YAML key "notes"; if leftovers also exist, merge them with it under
+        # that key (rather than silently dropping the leftovers).
+        if notes:
+            if "notes" in entry:
+                notes["note"] = entry["notes"]
+            entry["notes"] = notes
+
+
+def write_yaml_model(
+    model: cobra.Model, path: str | Path, *, sort_ids: bool = False
+) -> None:
+    """Write a ``cobra.Model`` to RAVEN/cobrapy (``!!omap``) YAML.
+
+    With ``sort_ids=True`` metabolites/reactions/genes/compartments are written
+    in alphabetical order (diff-friendly), without modifying ``model``.
+    """
+    model_notes = dict(model.notes or {})
+    stored_meta = model_notes.pop("metaData", None) or {}
+    version = model_notes.pop("version", None)
+    foreign = model_notes.pop("_yaml_sections", None) or {}
+
+    doc = OrderedDict(_to_plain(model_to_dict(model)))
+
+    if sort_ids:
+        for section in ("metabolites", "reactions", "genes"):
+            if section in doc:
+                doc[section] = sorted(doc[section], key=lambda e: e.get("id", ""))
+        if isinstance(doc.get("compartments"), dict):
+            doc["compartments"] = dict(sorted(doc["compartments"].items()))
+
+    _emit_entry_fields(doc.get("metabolites", []), _MET_FIELDS)
+    _emit_entry_fields(doc.get("reactions", []), _RXN_FIELDS)
+    _emit_entry_fields(doc.get("genes", []), _GENE_FIELDS)
+
+    # cobra dict order is metabolites, reactions, genes, id, name, compartments;
+    # append version / gecko_light / metaData / ec-* like RAVEN's writer.
+    if version is not None:
+        doc["version"] = version
+    metadata = dict(stored_meta)
+    if model.id:
+        metadata.setdefault("id", model.id)
+    if model.name:
+        metadata.setdefault("name", model.name)
+    for key in ("gecko_light",):
+        if key in foreign:
+            doc[key] = foreign.pop(key)
+    if metadata:
+        doc["metaData"] = metadata
+    for key, value in foreign.items():
+        doc[key] = value
+
+    with _open_text(path, "w") as handle:
+        _cobra_yaml.dump(doc, handle)
diff --git a/tests/test_io_sif.py b/tests/test_io_sif.py
new file mode 100644
index 0000000..d50ad98
--- /dev/null
+++ b/tests/test_io_sif.py
@@ -0,0 +1,82 @@
+"""Tests for raven_python.io.sif (exportModelToSIF port)."""
+import cobra
+import pytest
+
+from raven_python.io import export_model_to_sif
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+    add_reactions_from_equations(
+        m,
+        [
+            {"id": "R1", "equation": "a --> b"},
+            {"id": "R2", "equation": "b --> c"},
+        ],
+    )
+    return m
+
+
+def _lines(path):
+    return [ln.split("\t") for ln in path.read_text().splitlines()]
+
+
+def test_reaction_compound(model, tmp_path):
+    out = tmp_path / "g.sif"
+    export_model_to_sif(model, out, "rc")
+    rows = {r[0]: (r[1], set(r[2:])) for r in _lines(out)}
+    assert rows["R1"] == ("rc", {"a", "b"})
+    assert rows["R2"] == ("rc", {"b", "c"})
+
+
+def test_reaction_reaction(model, tmp_path):
+    out = tmp_path / "g.sif"
+    export_model_to_sif(model, out, "rr")
+    rows = {r[0]: set(r[2:]) for r in _lines(out)}
+    # R1 and R2 share metabolite b
+    assert rows["R1"] == {"R2"}
+    assert rows["R2"] == {"R1"}
+
+
+def test_compound_compound(model, tmp_path):
+    out = tmp_path / "g.sif"
+    export_model_to_sif(model, out, "cc")
+    rows = {r[0]: set(r[2:]) for r in _lines(out)}
+    # a is a substrate of R1 (a->b): a links to product b
+    assert "b" in rows.get("a", set())
+    # b is substrate of R2 (b->c): b links to c
+    assert "c" in rows.get("b", set())
+
+
+def test_custom_labels(model, tmp_path):
+    out = tmp_path / "g.sif"
+    export_model_to_sif(model, out, "rc", reaction_labels={"R1": "Reaction1"})
+    sources = {r[0] for r in _lines(out)}
+    assert "Reaction1" in sources
+    assert "R1" not in sources
+
+
+def test_bad_graph_type(model, tmp_path):
+    with pytest.raises(ValueError, match="graph_type"):
+        export_model_to_sif(model, tmp_path / "g.sif", "xx")
+
+
+def test_cc_does_not_mutate_input(model, tmp_path):
+    n_before = len(model.reactions)
+    export_model_to_sif(model, tmp_path / "g.sif", "cc")
+    assert len(model.reactions) == n_before  # convert_to_irreversible ran on a copy
+
+
+# --- regression: label-map collision (known_issues.md B4) ------------------
+
+def test_collapsing_label_map_warns(model, tmp_path):
+    """A label map that sends two distinct ids to the same label silently merges
+    nodes during the target-side dedup. Now warns so the user sees it."""
+    with pytest.warns(UserWarning, match="multiple ids to the same label"):
+        export_model_to_sif(
+            model, tmp_path / "g.sif", "rc",
+            reaction_labels={"R1": "shared", "R2": "shared"},
+        )
diff --git a/tests/test_io_yaml.py b/tests/test_io_yaml.py
new file mode 100644
index 0000000..510af5f
--- /dev/null
+++ b/tests/test_io_yaml.py
@@ -0,0 +1,202 @@
+"""Tests for raven_python.io.yaml against the RAVEN fa281a1 (cobra-native !!omap) schema."""
+from pathlib import Path
+
+import cobra
+import pytest
+from cobra.io.yaml import yaml as cobra_yaml
+
+from raven_python.io import read_yaml_model, write_yaml_model
+
+# A model laid out exactly as RAVEN writeYAMLmodel (fa281a1) emits: cobra-native
+# structure, RAVEN-only fields as top-level per-entry keys, smiles/ec-code inside
+# the annotation block, metaData provenance-only, id/name/version top-level.
+RAVEN_DOC = {
+    "metabolites": [
+        {
+            "id": "s_0001",
+            "name": "ATP",
+            "compartment": "c",
+            "formula": "C10H16N5O13P3",
+            "charge": -4,
+            "inchis": "InChI=1S/CH4",
+            "deltaG": 12.5,
+            "notes": "a metabolite note",
+            "metFrom": "KEGG",
+            "annotation": {"kegg.compound": ["C00002"], "smiles": ["C1=NC2"]},
+        },
+        {"id": "s_0002", "name": "ADP", "compartment": "c"},
+    ],
+    "reactions": [
+        {
+            "id": "R1",
+            "name": "rxn one",
+            "metabolites": {"s_0001": -1, "s_0002": 1},
+            "lower_bound": -1000.0,
+            "upper_bound": 1000.0,
+            "gene_reaction_rule": "G1",
+            "subsystem": "glycolysis",
+            "confidence_score": 2,
+            "references": "PMID:123",
+            "rxnFrom": "manual",
+            "notes": "a reaction note",
+            "deltaG": -5.0,
+            "annotation": {"ec-code": ["1.1.1.1"]},
+        }
+    ],
+    "genes": [
+        {"id": "G1", "name": "gene one", "protein": "P12345", "annotation": {"uniprot": ["P12345"]}}
+    ],
+    "id": "testModel",
+    "name": "Test Model",
+    "compartments": {"c": "cytoplasm"},
+    "version": "1.0",
+    "metaData": {"date": "2026-05-23", "taxonomy": "taxonomy/559292", "defaultLB": "-1000"},
+    "ec-rxns": [{"id": "R1", "kcat": 100.0}],
+}
+
+
+@pytest.fixture
+def yaml_file(tmp_path) -> Path:
+    p = tmp_path / "model.yml"
+    with open(p, "w", encoding="utf-8") as fh:
+        cobra_yaml.dump(RAVEN_DOC, fh)
+    return p
+
+
+def test_standard_content(yaml_file):
+    model = read_yaml_model(yaml_file)
+    assert model.id == "testModel"
+    assert model.name == "Test Model"
+    assert {m.id for m in model.metabolites} == {"s_0001", "s_0002"}
+    r = model.reactions.get_by_id("R1")
+    assert r.bounds == (-1000.0, 1000.0)
+    assert r.subsystem == "glycolysis"
+    assert r.gene_reaction_rule == "G1"
+
+
+def test_annotation_owned_by_cobra(yaml_file):
+    # smiles / ec-code / miriam live in the annotation block (cobra reads them)
+    model = read_yaml_model(yaml_file)
+    assert model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"]
+    assert model.metabolites.get_by_id("s_0001").annotation["kegg.compound"] == ["C00002"]
+    assert model.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"]
+    assert model.genes.get_by_id("G1").annotation["uniprot"] == ["P12345"]
+
+
+def test_raven_only_fields_captured(yaml_file):
+    model = read_yaml_model(yaml_file)
+    a = model.metabolites.get_by_id("s_0001")
+    assert a.notes["inchis"] == "InChI=1S/CH4"
+    assert a.notes["deltaG"] == 12.5
+    assert a.notes["note"] == "a metabolite note"  # RAVEN metNotes string, no crash
+    assert a.notes["metFrom"] == "KEGG"
+    assert "smiles" not in a.notes  # smiles stays in annotation
+    r = model.reactions.get_by_id("R1")
+    assert r.notes["confidence_score"] == 2
+    assert r.notes["references"] == "PMID:123"
+    assert r.notes["rxnFrom"] == "manual"
+    assert r.notes["note"] == "a reaction note"
+    assert r.notes["deltaG"] == -5.0
+    assert model.genes.get_by_id("G1").notes["protein"] == "P12345"
+
+
+def test_model_level_extras(yaml_file):
+    model = read_yaml_model(yaml_file)
+    assert model.notes["metaData"]["taxonomy"] == "taxonomy/559292"
+    assert model.notes["version"] == "1.0"
+    assert model.notes["_yaml_sections"]["ec-rxns"][0]["kcat"] == 100.0
+
+
+def test_round_trip(yaml_file, tmp_path):
+    model = read_yaml_model(yaml_file)
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    reloaded = read_yaml_model(out)
+
+    assert reloaded.id == "testModel"
+    assert reloaded.notes["version"] == "1.0"
+    assert reloaded.notes["metaData"]["taxonomy"] == "taxonomy/559292"
+    a = reloaded.metabolites.get_by_id("s_0001")
+    assert a.notes["deltaG"] == 12.5
+    assert a.notes["note"] == "a metabolite note"
+    assert a.annotation["smiles"] == ["C1=NC2"]
+    r = reloaded.reactions.get_by_id("R1")
+    assert r.notes["confidence_score"] == 2
+    assert reloaded.genes.get_by_id("G1").notes["protein"] == "P12345"
+    assert reloaded.notes["_yaml_sections"]["ec-rxns"][0]["id"] == "R1"
+
+
+def test_extra_notes_not_dropped_when_free_text_note_present(yaml_file, tmp_path):
+    """An entry with both a RAVEN free-text note and an extra note keeps both on write."""
+    model = read_yaml_model(yaml_file)
+    a = model.metabolites.get_by_id("s_0001")
+    a.notes["note"] = "free text"
+    a.notes["custom"] = "extra value"  # a non-RAVEN note that must not be silently lost
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    text = out.read_text()
+    assert "extra value" in text  # the leftover note survives serialization
+
+
+def test_gzipped_round_trip(yaml_file, tmp_path):
+    # A .yml.gz path is transparently gzipped on write and read.
+    model = read_yaml_model(yaml_file)
+    out = tmp_path / "out.yml.gz"
+    write_yaml_model(model, out)
+    assert out.read_bytes()[:2] == b"\x1f\x8b"  # gzip magic
+    reloaded = read_yaml_model(out)
+    assert reloaded.id == "testModel"
+    assert {m.id for m in reloaded.metabolites} == {"s_0001", "s_0002"}
+
+
+def test_output_is_cobra_readable(yaml_file, tmp_path):
+    # The written file must load with stock cobra (it's cobra's native format).
+    model = read_yaml_model(yaml_file)
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    cobra_model = cobra.io.load_yaml_model(str(out))
+    assert cobra_model.id == "testModel"
+    assert {m.id for m in cobra_model.metabolites} == {"s_0001", "s_0002"}
+    # RAVEN-only fields land in cobra notes; smiles in annotation
+    assert cobra_model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"]
+
+
+def test_write_emits_raven_top_level_keys(yaml_file, tmp_path):
+    model = read_yaml_model(yaml_file)
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    text = out.read_text()
+    # RAVEN-only fields are lifted back to top-level entry keys, not buried in notes
+    assert "inchis:" in text
+    assert "deltaG:" in text
+    assert "confidence_score:" in text
+    assert "metaData:" in text
+
+
+def test_legacy_id_in_metadata(tmp_path):
+    # Older RAVEN files nest id/name under metaData and have no top-level id.
+    legacy = {
+        "metabolites": [{"id": "a_c", "name": "A", "compartment": "c"}],
+        "reactions": [],
+        "genes": [],
+        "compartments": {"c": "cyt"},
+        "metaData": {"id": "legacyModel", "name": "Legacy"},
+    }
+    p = tmp_path / "legacy.yml"
+    with open(p, "w", encoding="utf-8") as fh:
+        cobra_yaml.dump(legacy, fh)
+    model = read_yaml_model(p)
+    assert model.id == "legacyModel"
+    assert model.name == "Legacy"
+
+
+# Optional smoke test against a real model file if present.
+_YEAST = Path("/home/eduardk/github/GECKO/tutorials/full_ecModel/models/yeast-GEM.yml")
+
+
+@pytest.mark.skipif(not _YEAST.exists(), reason="real yeast-GEM.yml not available")
+def test_real_yeast_gem_loads():
+    model = read_yaml_model(_YEAST)
+    assert len(model.reactions) > 1000
+    # legacy file: identity comes from metaData
+    assert model.id

From 7a9b69accd9965c7f365e3d6a680fa2cc9fe293e Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Fri, 29 May 2026 22:51:36 +0200
Subject: [PATCH 7/7] Add Excel export and the Standard-GEM git-layout export

---
 src/raven_python/io/excel.py | 136 +++++++++++++++++++++++++++++++++++
 src/raven_python/io/git.py   | 106 +++++++++++++++++++++++++++
 tests/test_io_excel.py       | 111 ++++++++++++++++++++++++++++
 tests/test_io_git.py         |  69 ++++++++++++++++++
 4 files changed, 422 insertions(+)
 create mode 100644 src/raven_python/io/excel.py
 create mode 100644 src/raven_python/io/git.py
 create mode 100644 tests/test_io_excel.py
 create mode 100644 tests/test_io_git.py

diff --git a/src/raven_python/io/excel.py b/src/raven_python/io/excel.py
new file mode 100644
index 0000000..cf6196e
--- /dev/null
+++ b/src/raven_python/io/excel.py
@@ -0,0 +1,136 @@
+"""Export a model to the RAVEN Microsoft Excel format.
+
+Writes the five-sheet RAVEN xlsx layout — RXNS, METS, COMPS, GENES, MODEL — pulling
+RAVEN-specific values back out of cobra's ``annotation`` / ``notes`` (where the
+raven_python YAML reader stashes them). Excel *import* is intentionally not provided.
+
+Requires the optional ``openpyxl`` dependency (``pip install raven_python[excel]``).
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import cobra
+
+
+def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str:
+    """RAVEN MIRIAM column: ``namespace/id;namespace/id2;...`` (sorted)."""
+    parts = []
+    for namespace in sorted(annotation):
+        if namespace in exclude:
+            continue
+        values = annotation[namespace]
+        if isinstance(values, str):
+            values = [values]
+        parts.extend(f"{namespace}/{value}" for value in values)
+    return ";".join(parts)
+
+
+def _equation(rxn: cobra.Reaction) -> str:
+    """Human-readable equation in RAVEN ``name[comp]`` form."""
+
+    def side(items):
+        return " + ".join(
+            f"{abs(coef):g} {met.name}[{met.compartment}]" for met, coef in items
+        )
+
+    reactants = [(m, c) for m, c in rxn.metabolites.items() if c < 0]
+    products = [(m, c) for m, c in rxn.metabolites.items() if c > 0]
+    arrow = " <=> " if rxn.reversibility else " => "
+    return f"{side(reactants)}{arrow}{side(products)}"
+
+
+def _ec_codes(rxn: cobra.Reaction) -> str:
+    codes = rxn.annotation.get("ec-code", [])
+    if isinstance(codes, str):
+        codes = [codes]
+    return ";".join(codes)
+
+
+def export_to_excel(
+    model: cobra.Model, path: str | Path, *, sort_ids: bool = False
+) -> None:
+    """Write ``model`` to a RAVEN-format ``.xlsx`` file.
+
+    Parameters
+    ----------
+    sort_ids
+        If True, write reactions/metabolites/genes sorted alphabetically by ID
+        (the model itself is not modified).
+    """
+    try:
+        from openpyxl import Workbook
+    except ImportError as exc:  # pragma: no cover - exercised only without openpyxl
+        raise ImportError(
+            "export_to_excel requires openpyxl. Install it with "
+            "`pip install raven_python[excel]` (or `pip install openpyxl`)."
+        ) from exc
+
+    reactions = sorted(model.reactions, key=lambda r: r.id) if sort_ids else list(model.reactions)
+    metabolites = (
+        sorted(model.metabolites, key=lambda m: m.id) if sort_ids else list(model.metabolites)
+    )
+    genes = sorted(model.genes, key=lambda g: g.id) if sort_ids else list(model.genes)
+    metadata = dict(model.notes.get("metaData", {})) if model.notes else {}
+
+    wb = Workbook()
+    wb.remove(wb.active)  # drop the default empty sheet
+
+    # --- RXNS ---
+    ws = wb.create_sheet("RXNS")
+    ws.append(
+        ["#", "ID", "NAME", "EQUATION", "EC-NUMBER", "GENE ASSOCIATION", "LOWER BOUND",
+         "UPPER BOUND", "OBJECTIVE", "COMPARTMENT", "MIRIAM", "SUBSYSTEM",
+         "REPLACEMENT ID", "NOTE", "REFERENCE", "CONFIDENCE SCORE"]
+    )
+    for r in reactions:
+        subsystem = r.subsystem
+        if isinstance(subsystem, (list, tuple)):
+            subsystem = ";".join(subsystem)
+        ws.append([
+            None, r.id, r.name, _equation(r), _ec_codes(r), r.gene_reaction_rule,
+            r.lower_bound, r.upper_bound,
+            r.objective_coefficient or None, None,
+            _miriam_string(r.annotation, exclude=("ec-code",)), subsystem, None,
+            r.notes.get("note"), r.notes.get("references"), r.notes.get("confidence_score"),
+        ])
+
+    # --- METS ---
+    ws = wb.create_sheet("METS")
+    ws.append(["#", "ID", "NAME", "UNCONSTRAINED", "MIRIAM", "COMPOSITION", "InChI",
+               "COMPARTMENT", "REPLACEMENT ID", "CHARGE"])
+    for m in metabolites:
+        inchi = m.notes.get("inchis")
+        ws.append([
+            None, f"{m.name}[{m.compartment}]", m.name, None,
+            _miriam_string(m.annotation, exclude=("smiles",)),
+            None if inchi else m.formula, inchi, m.compartment, m.id, m.charge,
+        ])
+
+    # --- COMPS ---
+    ws = wb.create_sheet("COMPS")
+    ws.append(["#", "ABBREVIATION", "NAME", "INSIDE", "MIRIAM"])
+    comps = sorted(model.compartments) if sort_ids else list(model.compartments)
+    for cid in comps:
+        ws.append([None, cid, model.compartments.get(cid, ""), None, None])
+
+    # --- GENES ---
+    if genes:
+        ws = wb.create_sheet("GENES")
+        ws.append(["#", "NAME", "MIRIAM", "SHORT NAME", "COMPARTMENT"])
+        for g in genes:
+            ws.append([None, g.id, _miriam_string(g.annotation), g.name, None])
+
+    # --- MODEL ---
+    ws = wb.create_sheet("MODEL")
+    ws.append(["#", "ID", "NAME", "TAXONOMY", "DEFAULT LOWER", "DEFAULT UPPER",
+               "CONTACT GIVEN NAME", "CONTACT FAMILY NAME", "CONTACT EMAIL",
+               "ORGANIZATION", "NOTES"])
+    ws.append([
+        None, model.id or "blankID", model.name or "blankName",
+        metadata.get("taxonomy"), metadata.get("defaultLB"), metadata.get("defaultUB"),
+        metadata.get("givenName"), metadata.get("familyName"), metadata.get("email"),
+        metadata.get("organization"), metadata.get("note"),
+    ])
+
+    wb.save(str(path))
diff --git a/src/raven_python/io/git.py b/src/raven_python/io/git.py
new file mode 100644
index 0000000..80bf8e8
--- /dev/null
+++ b/src/raven_python/io/git.py
@@ -0,0 +1,106 @@
+"""Export a model into a Standard-GEM versioned-repository layout.
+
+Writes the model in several formats into the Standard-GEM folder structure (a
+``model/`` directory with one subfolder per format), ready to commit to a
+Git-maintained model repository (Metabolic Atlas / Human-GEM / yeast-GEM style),
+plus a ``dependencies.txt`` recording tool versions.
+
+Thin orchestration over the writers raven_python already exposes: ``write_yaml_model``,
+cobra's ``write_sbml_model`` and ``save_matlab_model``, ``export_to_excel``, plus a
+single-file reaction table (txt).
+"""
+from __future__ import annotations
+
+import importlib.metadata as _md
+import platform
+from collections.abc import Iterable
+from pathlib import Path
+
+import cobra
+
+from raven_python.io.excel import _equation, export_to_excel
+from raven_python.io.yaml import write_yaml_model
+from raven_python.utils.sort import sort_identifiers
+
+_ALL_FORMATS = ("yml", "xml", "mat", "xlsx", "txt")
+
+
+def _version(package: str) -> str:
+    try:
+        return _md.version(package)
+    except _md.PackageNotFoundError:
+        return "unknown"
+
+
+def _write_txt(model: cobra.Model, path: Path) -> None:
+    """Single-file, human-readable reaction table (RAVEN exportForGit txt)."""
+    with open(path, "w", encoding="utf-8") as fh:
+        fh.write("Rxn name\tFormula\tGene-reaction association\tLB\tUB\tObjective\n")
+        for r in model.reactions:
+            fh.write(
+                f"{r.id}\t{_equation(r)}\t{r.gene_reaction_rule}\t"
+                f"{r.lower_bound:g}\t{r.upper_bound:g}\t{r.objective_coefficient:g}\n"
+            )
+
+
+def export_for_git(
+    model: cobra.Model,
+    path: str | Path = ".",
+    *,
+    prefix: str = "model",
+    formats: Iterable[str] = ("yml", "xml", "mat", "xlsx"),
+    sub_dirs: bool = True,
+) -> Path:
+    """Write ``model`` into a Standard-GEM repository layout.
+
+    Parameters
+    ----------
+    path
+        Directory to populate.
+    prefix
+        Base filename for every format (default ``"model"``).
+    formats
+        Which formats to write; any of ``"yml"``, ``"xml"``, ``"mat"``,
+        ``"xlsx"``, ``"txt"`` (default ``yml``/``xml``/``mat``/``xlsx``).
+    sub_dirs
+        If True (default), write ``model/<fmt>/<prefix>.<fmt>`` (standard-GEM
+        layout); otherwise all files go directly in ``path``.
+
+    Returns
+    -------
+    pathlib.Path
+        The root directory written to.
+    """
+    formats = list(formats)
+    unknown = set(formats) - set(_ALL_FORMATS)
+    if unknown:
+        raise ValueError(f"Unknown format(s): {sorted(unknown)}; allowed: {_ALL_FORMATS}")
+
+    # Sort a copy so the caller's model is untouched.
+    model = sort_identifiers(model.copy())
+
+    root = Path(path) / "model" if sub_dirs else Path(path)
+    root.mkdir(parents=True, exist_ok=True)
+
+    def target(fmt: str) -> Path:
+        folder = root / fmt if sub_dirs else root
+        folder.mkdir(parents=True, exist_ok=True)
+        return folder / f"{prefix}.{fmt}"
+
+    if "yml" in formats:
+        write_yaml_model(model, target("yml"))
+    if "xml" in formats:
+        cobra.io.write_sbml_model(model, str(target("xml")))
+    if "mat" in formats:
+        cobra.io.save_matlab_model(model, str(target("mat")))
+    if "xlsx" in formats:
+        export_to_excel(model, target("xlsx"))
+    if "txt" in formats:
+        _write_txt(model, target("txt"))
+
+    with open(root / "dependencies.txt", "w", encoding="utf-8") as fh:
+        fh.write(f"python\t{platform.python_version()}\n")
+        fh.write(f"cobra\t{_version('cobra')}\n")
+        fh.write(f"raven_python\t{_version('raven_python')}\n")
+
+    return root
diff --git a/tests/test_io_excel.py b/tests/test_io_excel.py
new file mode 100644
index 0000000..12434ef
--- /dev/null
+++ b/tests/test_io_excel.py
@@ -0,0 +1,111 @@
+"""Tests for raven_python.io.excel (exportToExcelFormat port, export only)."""
+import cobra
+import pytest
+
+openpyxl = pytest.importorskip("openpyxl")
+
+from raven_python.io import export_to_excel
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("yeastGEM")
+    m.name = "Yeast"
+    m.compartments = {"c": "cytoplasm"}
+    m.notes["metaData"] = {"taxonomy": "taxonomy/559292", "defaultLB": "-1000"}
+    m.add_metabolites(
+        [
+            cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+            cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+        ]
+    )
+    m.metabolites.atp_c.annotation = {"kegg.compound": ["C00002"], "smiles": ["C1=NC"]}
+    m.metabolites.atp_c.notes = {"inchis": "InChI=1S/X"}
+    add_reactions_from_equations(
+        m,
+        [{"id": "R1", "equation": "atp_c <=> adp_c", "name": "rxn one",
+          "gene_reaction_rule": "G1", "subsystem": "glycolysis"}],
+    )
+    r = m.reactions.R1
+    r.annotation = {"ec-code": ["1.1.1.1"], "kegg.reaction": ["R00001"]}
+    r.notes = {"confidence_score": 2, "note": "a note", "references": "PMID:1"}
+    r.objective_coefficient = 1
+    return m
+
+
+def _wb(path):
+    return openpyxl.load_workbook(path)
+
+
+def test_sheets_present(model, tmp_path):
+    out = tmp_path / "m.xlsx"
+    export_to_excel(model, out)
+    wb = _wb(out)
+    assert set(wb.sheetnames) == {"RXNS", "METS", "COMPS", "GENES", "MODEL"}
+
+
+def test_rxns_sheet(model, tmp_path):
+    out = tmp_path / "m.xlsx"
+    export_to_excel(model, out)
+    ws = _wb(out)["RXNS"]
+    header = [c.value for c in ws[1]]
+    row = {header[i]: c.value for i, c in enumerate(ws[2])}
+    assert row["ID"] == "R1"
+    assert row["NAME"] == "rxn one"
+    assert "ATP[c]" in row["EQUATION"] and "<=>" in row["EQUATION"]
+    assert row["EC-NUMBER"] == "1.1.1.1"
+    assert row["GENE ASSOCIATION"] == "G1"
+    assert row["SUBSYSTEM"] == "glycolysis"
+    assert row["OBJECTIVE"] == 1
+    assert row["CONFIDENCE SCORE"] == 2
+    assert row["NOTE"] == "a note"
+    assert row["MIRIAM"] == "kegg.reaction/R00001"  # ec-code excluded (own column)
+
+
+def test_mets_sheet(model, tmp_path):
+    out = tmp_path / "m.xlsx"
+    export_to_excel(model, out)
+    ws = _wb(out)["METS"]
+    header = [c.value for c in ws[1]]
+    rows = {
+        r[header.index("REPLACEMENT ID")].value: {header[i]: c.value for i, c in enumerate(r)}
+        for r in ws.iter_rows(min_row=2)
+    }
+    atp = rows["atp_c"]
+    assert atp["ID"] == "ATP[c]"
+    assert atp["NAME"] == "ATP"
+    assert atp["InChI"] == "InChI=1S/X"
+    assert atp["COMPOSITION"] is None  # suppressed when InChI present
+    assert atp["CHARGE"] == -4
+    assert atp["MIRIAM"] == "kegg.compound/C00002"  # smiles excluded
+
+
+def test_model_sheet(model, tmp_path):
+    out = tmp_path / "m.xlsx"
+    export_to_excel(model, out)
+    ws = _wb(out)["MODEL"]
+    header = [c.value for c in ws[1]]
+    row = {header[i]: c.value for i, c in enumerate(ws[2])}
+    assert row["ID"] == "yeastGEM"
+    assert row["NAME"] == "Yeast"
+    assert row["TAXONOMY"] == "taxonomy/559292"
+    assert row["DEFAULT LOWER"] == "-1000"
+
+
+def test_genes_sheet(model, tmp_path):
+    out = tmp_path / "m.xlsx"
+    export_to_excel(model, out)
+    ws = _wb(out)["GENES"]
+    header = [c.value for c in ws[1]]
+    row = {header[i]: c.value for i, c in enumerate(ws[2])}
+    assert row["NAME"] == "G1"
+
+
+def test_no_genes_skips_sheet(tmp_path):
+    m = cobra.Model("t")
+    m.add_metabolites([cobra.Metabolite("a_c", compartment="c")])
+    add_reactions_from_equations(m, [{"id": "R1", "equation": "a_c -->"}])
+    out = tmp_path / "m.xlsx"
+    export_to_excel(m, out)
+    assert "GENES" not in _wb(out).sheetnames
diff --git a/tests/test_io_git.py b/tests/test_io_git.py
new file mode 100644
index 0000000..28881dc
--- /dev/null
+++ b/tests/test_io_git.py
@@ -0,0 +1,69 @@
+"""Tests for raven_python.io.git (exportForGit port)."""
+import cobra
+import pytest
+
+from raven_python.io import export_for_git
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+    m = cobra.Model("yeastGEM")
+    m.compartments = {"c": "cytoplasm"}
+    m.add_metabolites(
+        [cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+         cobra.Metabolite("adp_c", name="ADP", compartment="c")]
+    )
+    add_reactions_from_equations(m, [{"id": "R1", "equation": "atp_c <=> adp_c"}])
+    return m
+
+
+def test_standard_gem_layout(model, tmp_path):
+    root = export_for_git(model, tmp_path, prefix="yeast", formats=("yml", "xml", "mat", "xlsx", "txt"))
+    assert root == tmp_path / "model"
+    assert (root / "yml" / "yeast.yml").exists()
+    assert (root / "xml" / "yeast.xml").exists()
+    assert (root / "mat" / "yeast.mat").exists()
+    assert (root / "xlsx" / "yeast.xlsx").exists()
+    assert (root / "txt" / "yeast.txt").exists()
+    assert (root / "dependencies.txt").exists()
+
+
+def test_dependencies_file(model, tmp_path):
+    root = export_for_git(model, tmp_path, formats=("yml",))
+    deps = (root / "dependencies.txt").read_text()
+    assert "python\t" in deps
+    assert "cobra\t" in deps
+    assert "raven_python\t" in deps
+
+
+def test_flat_layout(model, tmp_path):
+    root = export_for_git(model, tmp_path, formats=("yml",), sub_dirs=False)
+    assert root == tmp_path
+    assert (tmp_path / "model.yml").exists()
+
+
+def test_subset_of_formats(model, tmp_path):
+    root = export_for_git(model, tmp_path, formats=("yml", "xml"))
+    assert (root / "yml" / "model.yml").exists()
+    assert not (root / "mat").exists()
+    assert not (root / "xlsx").exists()
+
+
+def test_does_not_mutate_model(model, tmp_path):
+    order_before = [r.id for r in model.reactions]
+    export_for_git(model, tmp_path, formats=("yml",))
+    assert [r.id for r in model.reactions] == order_before
+
+
+def test_txt_table_content(model, tmp_path):
+    root = export_for_git(model, tmp_path, formats=("txt",))
+    txt = (root / "txt" / "model.txt").read_text()
+    assert txt.splitlines()[0].startswith("Rxn name\t")
+    assert "R1" in txt
+    assert "ATP[c]" in txt
+
+
+def test_bad_format(model, tmp_path):
+    with pytest.raises(ValueError, match="Unknown format"):
+        export_for_git(model, tmp_path, formats=("yml", "json"))