diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..3f6d19b
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,48 @@
+name: CI
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ workflow_dispatch:
+
+# A push that obsoletes a previous run cancels it.
+concurrency:
+ group: ci-${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ lint:
+ name: ruff
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ cache: pip
+ cache-dependency-path: pyproject.toml
+ - run: pip install --upgrade pip
+ - run: pip install ruff
+ - run: ruff check .
+
+ test:
+ name: pytest (py${{ matrix.python }})
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python: ["3.11", "3.12", "3.13"]
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python }}
+ cache: pip
+ cache-dependency-path: pyproject.toml
+ - run: pip install --upgrade pip
+ # ``-e .[dev,plotting,excel]`` so every optional extra is exercised.
+ # Gurobi is not installable on free runners; the relevant tests
+ # skip themselves when ``optlang.gurobi_interface`` cannot import.
+ - run: pip install -e ".[dev,plotting,excel]"
+ - run: pytest -q --maxfail=5 --durations=20
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1f44cf3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,541 @@
+GNU GENERAL PUBLIC LICENSE
+==========================
+
+Version 3, 29 June 2007
+
+Copyright © 2007 Free Software Foundation, Inc. <>
+
+Everyone is permitted to copy and distribute verbatim copies of this license
+document, but changing it is not allowed.
+
+## Preamble
+
+The GNU General Public License is a free, copyleft license for software and other
+kinds of works.
+
+The licenses for most software and other practical works are designed to take away
+your freedom to share and change the works. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change all versions of a
+program--to make sure it remains free software for all its users. We, the Free
+Software Foundation, use the GNU General Public License for most of our software; it
+applies also to any other work released this way by its authors. You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not price. Our General
+Public Licenses are designed to make sure that you have the freedom to distribute
+copies of free software (and charge for them if you wish), that you receive source
+code or can get it if you want it, that you can change the software or use pieces of
+it in new free programs, and that you know you can do these things.
+
+To protect your rights, we need to prevent others from denying you these rights or
+asking you to surrender the rights. Therefore, you have certain responsibilities if
+you distribute copies of the software, or if you modify it: responsibilities to
+respect the freedom of others.
+
+For example, if you distribute copies of such a program, whether gratis or for a fee,
+you must pass on to the recipients the same freedoms that you received. You must make
+sure that they, too, receive or can get the source code. And you must show them these
+terms so they know their rights.
+
+Developers that use the GNU GPL protect your rights with two steps: (1) assert
+copyright on the software, and (2) offer you this License giving you legal permission
+to copy, distribute and/or modify it.
+
+For the developers' and authors' protection, the GPL clearly explains that there is
+no warranty for this free software. For both users' and authors' sake, the GPL
+requires that modified versions be marked as changed, so that their problems will not
+be attributed erroneously to authors of previous versions.
+
+Some devices are designed to deny users access to install or run modified versions of
+the software inside them, although the manufacturer can do so. This is fundamentally
+incompatible with the aim of protecting users' freedom to change the software. The
+systematic pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we have designed
+this version of the GPL to prohibit the practice for those products. If such problems
+arise substantially in other domains, we stand ready to extend this provision to
+those domains in future versions of the GPL, as needed to protect the freedom of
+users.
+
+Finally, every program is threatened constantly by software patents. States should
+not allow patents to restrict development and use of software on general-purpose
+computers, but in those that do, we wish to avoid the special danger that patents
+applied to a free program could make it effectively proprietary. To prevent this, the
+GPL assures that patents cannot be used to render the program non-free.
+
+The precise terms and conditions for copying, distribution and modification follow.
+
+## TERMS AND CONDITIONS
+
+### 0. Definitions.
+
+“This License” refers to version 3 of the GNU General Public License.
+
+“Copyright” also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+“The Program” refers to any copyrightable work licensed under this
+License. Each licensee is addressed as “you”. “Licensees” and
+“recipients” may be individuals or organizations.
+
+To “modify” a work means to copy from or adapt all or part of the work in
+a fashion requiring copyright permission, other than the making of an exact copy. The
+resulting work is called a “modified version” of the earlier work or a
+work “based on” the earlier work.
+
+A “covered work” means either the unmodified Program or a work based on
+the Program.
+
+To “propagate” a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for infringement under
+applicable copyright law, except executing it on a computer or modifying a private
+copy. Propagation includes copying, distribution (with or without modification),
+making available to the public, and in some countries other activities as well.
+
+To “convey” a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through a computer
+network, with no transfer of a copy, is not conveying.
+
+An interactive user interface displays “Appropriate Legal Notices” to the
+extent that it includes a convenient and prominently visible feature that (1)
+displays an appropriate copyright notice, and (2) tells the user that there is no
+warranty for the work (except to the extent that warranties are provided), that
+licensees may convey the work under this License, and how to view a copy of this
+License. If the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+### 1. Source Code.
+
+The “source code” for a work means the preferred form of the work for
+making modifications to it. “Object code” means any non-source form of a
+work.
+
+A “Standard Interface” means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of interfaces
+specified for a particular programming language, one that is widely used among
+developers working in that language.
+
+The “System Libraries” of an executable work include anything, other than
+the work as a whole, that (a) is included in the normal form of packaging a Major
+Component, but which is not part of that Major Component, and (b) serves only to
+enable use of the work with that Major Component, or to implement a Standard
+Interface for which an implementation is available to the public in source code form.
+A “Major Component”, in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system (if any) on which
+the executable work runs, or a compiler used to produce the work, or an object code
+interpreter used to run it.
+
+The “Corresponding Source” for a work in object code form means all the
+source code needed to generate, install, and (for an executable work) run the object
+code and to modify the work, including scripts to control those activities. However,
+it does not include the work's System Libraries, or general-purpose tools or
+generally available free programs which are used unmodified in performing those
+activities but which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for the work, and
+the source code for shared libraries and dynamically linked subprograms that the work
+is specifically designed to require, such as by intimate data communication or
+control flow between those subprograms and other parts of the work.
+
+The Corresponding Source need not include anything that users can regenerate
+automatically from other parts of the Corresponding Source.
+
+The Corresponding Source for a work in source code form is that same work.
+
+### 2. Basic Permissions.
+
+All rights granted under this License are granted for the term of copyright on the
+Program, and are irrevocable provided the stated conditions are met. This License
+explicitly affirms your unlimited permission to run the unmodified Program. The
+output from running a covered work is covered by this License only if the output,
+given its content, constitutes a covered work. This License acknowledges your rights
+of fair use or other equivalent, as provided by copyright law.
+
+You may make, run and propagate covered works that you do not convey, without
+conditions so long as your license otherwise remains in force. You may convey covered
+works to others for the sole purpose of having them make modifications exclusively
+for you, or provide you with facilities for running those works, provided that you
+comply with the terms of this License in conveying all material for which you do not
+control copyright. Those thus making or running the covered works for you must do so
+exclusively on your behalf, under your direction and control, on terms that prohibit
+them from making any copies of your copyrighted material outside their relationship
+with you.
+
+Conveying under any other circumstances is permitted solely under the conditions
+stated below. Sublicensing is not allowed; section 10 makes it unnecessary.
+
+### 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+No covered work shall be deemed part of an effective technological measure under any
+applicable law fulfilling obligations under article 11 of the WIPO copyright treaty
+adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention
+of such measures.
+
+When you convey a covered work, you waive any legal power to forbid circumvention of
+technological measures to the extent such circumvention is effected by exercising
+rights under this License with respect to the covered work, and you disclaim any
+intention to limit operation or modification of the work as a means of enforcing,
+against the work's users, your or third parties' legal rights to forbid circumvention
+of technological measures.
+
+### 4. Conveying Verbatim Copies.
+
+You may convey verbatim copies of the Program's source code as you receive it, in any
+medium, provided that you conspicuously and appropriately publish on each copy an
+appropriate copyright notice; keep intact all notices stating that this License and
+any non-permissive terms added in accord with section 7 apply to the code; keep
+intact all notices of the absence of any warranty; and give all recipients a copy of
+this License along with the Program.
+
+You may charge any price or no price for each copy that you convey, and you may offer
+support or warranty protection for a fee.
+
+### 5. Conveying Modified Source Versions.
+
+You may convey a work based on the Program, or the modifications to produce it from
+the Program, in the form of source code under the terms of section 4, provided that
+you also meet all of these conditions:
+
+* **a)** The work must carry prominent notices stating that you modified it, and giving a
+relevant date.
+* **b)** The work must carry prominent notices stating that it is released under this
+License and any conditions added under section 7. This requirement modifies the
+requirement in section 4 to “keep intact all notices”.
+* **c)** You must license the entire work, as a whole, under this License to anyone who
+comes into possession of a copy. This License will therefore apply, along with any
+applicable section 7 additional terms, to the whole of the work, and all its parts,
+regardless of how they are packaged. This License gives no permission to license the
+work in any other way, but it does not invalidate such permission if you have
+separately received it.
+* **d)** If the work has interactive user interfaces, each must display Appropriate Legal
+Notices; however, if the Program has interactive interfaces that do not display
+Appropriate Legal Notices, your work need not make them do so.
+
+A compilation of a covered work with other separate and independent works, which are
+not by their nature extensions of the covered work, and which are not combined with
+it such as to form a larger program, in or on a volume of a storage or distribution
+medium, is called an “aggregate” if the compilation and its resulting
+copyright are not used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work in an aggregate
+does not cause this License to apply to the other parts of the aggregate.
+
+### 6. Conveying Non-Source Forms.
+
+You may convey a covered work in object code form under the terms of sections 4 and
+5, provided that you also convey the machine-readable Corresponding Source under the
+terms of this License, in one of these ways:
+
+* **a)** Convey the object code in, or embodied in, a physical product (including a
+physical distribution medium), accompanied by the Corresponding Source fixed on a
+durable physical medium customarily used for software interchange.
+* **b)** Convey the object code in, or embodied in, a physical product (including a
+physical distribution medium), accompanied by a written offer, valid for at least
+three years and valid for as long as you offer spare parts or customer support for
+that product model, to give anyone who possesses the object code either (1) a copy of
+the Corresponding Source for all the software in the product that is covered by this
+License, on a durable physical medium customarily used for software interchange, for
+a price no more than your reasonable cost of physically performing this conveying of
+source, or (2) access to copy the Corresponding Source from a network server at no
+charge.
+* **c)** Convey individual copies of the object code with a copy of the written offer to
+provide the Corresponding Source. This alternative is allowed only occasionally and
+noncommercially, and only if you received the object code with such an offer, in
+accord with subsection 6b.
+* **d)** Convey the object code by offering access from a designated place (gratis or for
+a charge), and offer equivalent access to the Corresponding Source in the same way
+through the same place at no further charge. You need not require recipients to copy
+the Corresponding Source along with the object code. If the place to copy the object
+code is a network server, the Corresponding Source may be on a different server
+(operated by you or a third party) that supports equivalent copying facilities,
+provided you maintain clear directions next to the object code saying where to find
+the Corresponding Source. Regardless of what server hosts the Corresponding Source,
+you remain obligated to ensure that it is available for as long as needed to satisfy
+these requirements.
+* **e)** Convey the object code using peer-to-peer transmission, provided you inform
+other peers where the object code and Corresponding Source of the work are being
+offered to the general public at no charge under subsection 6d.
+
+A separable portion of the object code, whose source code is excluded from the
+Corresponding Source as a System Library, need not be included in conveying the
+object code work.
+
+A “User Product” is either (1) a “consumer product”, which
+means any tangible personal property which is normally used for personal, family, or
+household purposes, or (2) anything designed or sold for incorporation into a
+dwelling. In determining whether a product is a consumer product, doubtful cases
+shall be resolved in favor of coverage. For a particular product received by a
+particular user, “normally used” refers to a typical or common use of
+that class of product, regardless of the status of the particular user or of the way
+in which the particular user actually uses, or expects or is expected to use, the
+product. A product is a consumer product regardless of whether the product has
+substantial commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+“Installation Information” for a User Product means any methods,
+procedures, authorization keys, or other information required to install and execute
+modified versions of a covered work in that User Product from a modified version of
+its Corresponding Source. The information must suffice to ensure that the continued
+functioning of the modified object code is in no case prevented or interfered with
+solely because modification has been made.
+
+If you convey an object code work under this section in, or with, or specifically for
+use in, a User Product, and the conveying occurs as part of a transaction in which
+the right of possession and use of the User Product is transferred to the recipient
+in perpetuity or for a fixed term (regardless of how the transaction is
+characterized), the Corresponding Source conveyed under this section must be
+accompanied by the Installation Information. But this requirement does not apply if
+neither you nor any third party retains the ability to install modified object code
+on the User Product (for example, the work has been installed in ROM).
+
+The requirement to provide Installation Information does not include a requirement to
+continue to provide support service, warranty, or updates for a work that has been
+modified or installed by the recipient, or for the User Product in which it has been
+modified or installed. Access to a network may be denied when the modification itself
+materially and adversely affects the operation of the network or violates the rules
+and protocols for communication across the network.
+
+Corresponding Source conveyed, and Installation Information provided, in accord with
+this section must be in a format that is publicly documented (and with an
+implementation available to the public in source code form), and must require no
+special password or key for unpacking, reading or copying.
+
+### 7. Additional Terms.
+
+“Additional permissions” are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions. Additional
+permissions that are applicable to the entire Program shall be treated as though they
+were included in this License, to the extent that they are valid under applicable
+law. If additional permissions apply only to part of the Program, that part may be
+used separately under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+When you convey a copy of a covered work, you may at your option remove any
+additional permissions from that copy, or from any part of it. (Additional
+permissions may be written to require their own removal in certain cases when you
+modify the work.) You may place additional permissions on material, added by you to a
+covered work, for which you have or can give appropriate copyright permission.
+
+Notwithstanding any other provision of this License, for material you add to a
+covered work, you may (if authorized by the copyright holders of that material)
+supplement the terms of this License with terms:
+
+* **a)** Disclaiming warranty or limiting liability differently from the terms of
+sections 15 and 16 of this License; or
+* **b)** Requiring preservation of specified reasonable legal notices or author
+attributions in that material or in the Appropriate Legal Notices displayed by works
+containing it; or
+* **c)** Prohibiting misrepresentation of the origin of that material, or requiring that
+modified versions of such material be marked in reasonable ways as different from the
+original version; or
+* **d)** Limiting the use for publicity purposes of names of licensors or authors of the
+material; or
+* **e)** Declining to grant rights under trademark law for use of some trade names,
+trademarks, or service marks; or
+* **f)** Requiring indemnification of licensors and authors of that material by anyone
+who conveys the material (or modified versions of it) with contractual assumptions of
+liability to the recipient, for any liability that these contractual assumptions
+directly impose on those licensors and authors.
+
+All other non-permissive additional terms are considered “further
+restrictions” within the meaning of section 10. If the Program as you received
+it, or any part of it, contains a notice stating that it is governed by this License
+along with a term that is a further restriction, you may remove that term. If a
+license document contains a further restriction but permits relicensing or conveying
+under this License, you may add to a covered work material governed by the terms of
+that license document, provided that the further restriction does not survive such
+relicensing or conveying.
+
+If you add terms to a covered work in accord with this section, you must place, in
+the relevant source files, a statement of the additional terms that apply to those
+files, or a notice indicating where to find the applicable terms.
+
+Additional terms, permissive or non-permissive, may be stated in the form of a
+separately written license, or stated as exceptions; the above requirements apply
+either way.
+
+### 8. Termination.
+
+You may not propagate or modify a covered work except as expressly provided under
+this License. Any attempt otherwise to propagate or modify it is void, and will
+automatically terminate your rights under this License (including any patent licenses
+granted under the third paragraph of section 11).
+
+However, if you cease all violation of this License, then your license from a
+particular copyright holder is reinstated (a) provisionally, unless and until the
+copyright holder explicitly and finally terminates your license, and (b) permanently,
+if the copyright holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+Moreover, your license from a particular copyright holder is reinstated permanently
+if the copyright holder notifies you of the violation by some reasonable means, this
+is the first time you have received notice of violation of this License (for any
+work) from that copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+Termination of your rights under this section does not terminate the licenses of
+parties who have received copies or rights from you under this License. If your
+rights have been terminated and not permanently reinstated, you do not qualify to
+receive new licenses for the same material under section 10.
+
+### 9. Acceptance Not Required for Having Copies.
+
+You are not required to accept this License in order to receive or run a copy of the
+Program. Ancillary propagation of a covered work occurring solely as a consequence of
+using peer-to-peer transmission to receive a copy likewise does not require
+acceptance. However, nothing other than this License grants you permission to
+propagate or modify any covered work. These actions infringe copyright if you do not
+accept this License. Therefore, by modifying or propagating a covered work, you
+indicate your acceptance of this License to do so.
+
+### 10. Automatic Licensing of Downstream Recipients.
+
+Each time you convey a covered work, the recipient automatically receives a license
+from the original licensors, to run, modify and propagate that work, subject to this
+License. You are not responsible for enforcing compliance by third parties with this
+License.
+
+An “entity transaction” is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an organization, or
+merging organizations. If propagation of a covered work results from an entity
+transaction, each party to that transaction who receives a copy of the work also
+receives whatever licenses to the work the party's predecessor in interest had or
+could give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if the predecessor
+has it or can get it with reasonable efforts.
+
+You may not impose any further restrictions on the exercise of the rights granted or
+affirmed under this License. For example, you may not impose a license fee, royalty,
+or other charge for exercise of rights granted under this License, and you may not
+initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging
+that any patent claim is infringed by making, using, selling, offering for sale, or
+importing the Program or any portion of it.
+
+### 11. Patents.
+
+A “contributor” is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The work thus
+licensed is called the contributor's “contributor version”.
+
+A contributor's “essential patent claims” are all patent claims owned or
+controlled by the contributor, whether already acquired or hereafter acquired, that
+would be infringed by some manner, permitted by this License, of making, using, or
+selling its contributor version, but do not include claims that would be infringed
+only as a consequence of further modification of the contributor version. For
+purposes of this definition, “control” includes the right to grant patent
+sublicenses in a manner consistent with the requirements of this License.
+
+Each contributor grants you a non-exclusive, worldwide, royalty-free patent license
+under the contributor's essential patent claims, to make, use, sell, offer for sale,
+import and otherwise run, modify and propagate the contents of its contributor
+version.
+
+In the following three paragraphs, a “patent license” is any express
+agreement or commitment, however denominated, not to enforce a patent (such as an
+express permission to practice a patent or covenant not to sue for patent
+infringement). To “grant” such a patent license to a party means to make
+such an agreement or commitment not to enforce a patent against the party.
+
+If you convey a covered work, knowingly relying on a patent license, and the
+Corresponding Source of the work is not available for anyone to copy, free of charge
+and under the terms of this License, through a publicly available network server or
+other readily accessible means, then you must either (1) cause the Corresponding
+Source to be so available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner consistent with
+the requirements of this License, to extend the patent license to downstream
+recipients. “Knowingly relying” means you have actual knowledge that, but
+for the patent license, your conveying the covered work in a country, or your
+recipient's use of the covered work in a country, would infringe one or more
+identifiable patents in that country that you have reason to believe are valid.
+
+If, pursuant to or in connection with a single transaction or arrangement, you
+convey, or propagate by procuring conveyance of, a covered work, and grant a patent
+license to some of the parties receiving the covered work authorizing them to use,
+propagate, modify or convey a specific copy of the covered work, then the patent
+license you grant is automatically extended to all recipients of the covered work and
+works based on it.
+
+A patent license is “discriminatory” if it does not include within the
+scope of its coverage, prohibits the exercise of, or is conditioned on the
+non-exercise of one or more of the rights that are specifically granted under this
+License. You may not convey a covered work if you are a party to an arrangement with
+a third party that is in the business of distributing software, under which you make
+payment to the third party based on the extent of your activity of conveying the
+work, and under which the third party grants, to any of the parties who would receive
+the covered work from you, a discriminatory patent license (a) in connection with
+copies of the covered work conveyed by you (or copies made from those copies), or (b)
+primarily for and in connection with specific products or compilations that contain
+the covered work, unless you entered into that arrangement, or that patent license
+was granted, prior to 28 March 2007.
+
+Nothing in this License shall be construed as excluding or limiting any implied
+license or other defenses to infringement that may otherwise be available to you
+under applicable patent law.
+
+### 12. No Surrender of Others' Freedom.
+
+If conditions are imposed on you (whether by court order, agreement or otherwise)
+that contradict the conditions of this License, they do not excuse you from the
+conditions of this License. If you cannot convey a covered work so as to satisfy
+simultaneously your obligations under this License and any other pertinent
+obligations, then as a consequence you may not convey it at all. For example, if you
+agree to terms that obligate you to collect a royalty for further conveying from
+those to whom you convey the Program, the only way you could satisfy both those terms
+and this License would be to refrain entirely from conveying the Program.
+
+### 13. Use with the GNU Affero General Public License.
+
+Notwithstanding any other provision of this License, you have permission to link or
+combine any covered work with a work licensed under version 3 of the GNU Affero
+General Public License into a single combined work, and to convey the resulting work.
+The terms of this License will continue to apply to the part which is the covered
+work, but the special requirements of the GNU Affero General Public License, section
+13, concerning interaction through a network will apply to the combination as such.
+
+### 14. Revised Versions of this License.
+
+The Free Software Foundation may publish revised and/or new versions of the GNU
+General Public License from time to time. Such new versions will be similar in spirit
+to the present version, but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program specifies that
+a certain numbered version of the GNU General Public License “or any later
+version” applies to it, you have the option of following the terms and
+conditions either of that numbered version or of any later version published by the
+Free Software Foundation. If the Program does not specify a version number of the GNU
+General Public License, you may choose any version ever published by the Free
+Software Foundation.
+
+If the Program specifies that a proxy can decide which future versions of the GNU
+General Public License can be used, that proxy's public statement of acceptance of a
+version permanently authorizes you to choose that version for the Program.
+
+Later license versions may give you additional or different permissions. However, no
+additional obligations are imposed on any author or copyright holder as a result of
+your choosing to follow a later version.
+
+### 15. Disclaimer of Warranty.
+
+THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER
+EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE
+QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
+DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+### 16. Limitation of Liability.
+
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY
+COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS
+PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL,
+INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE
+OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE
+WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+### 17. Interpretation of Sections 15 and 16.
+
+If the disclaimer of warranty and limitation of liability provided above cannot be
+given local legal effect according to their terms, reviewing courts shall apply local
+law that most closely approximates an absolute waiver of all civil liability in
+connection with the Program, unless a warranty or assumption of liability accompanies
+a copy of the Program in return for a fee.
\ No newline at end of file
diff --git a/README.md b/README.md
index c8f3d64..69111be 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,82 @@
# raven-python
-The Python counterpart of the
-[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB), built on
-[cobrapy](https://github.com/opencobra/cobrapy).
-
-`raven-python` covers de-novo reconstruction (KEGG + protein homology),
-context-specific model extraction (`tINIT` / `ftINIT`), metabolic-task
-validation, gap-filling, omics ingestion, sub-cellular localisation, model
-manipulation, and YAML / SIF / Excel I/O — preserving the established RAVEN
-workflows in a Python-native form.
-
-This `main` branch is intentionally empty. Development happens on the
-`develop` branch via a series of feature branches; see the open and merged
-pull requests for the current state of the port.
+[](https://github.com/SysBioChalmers/raven-python/actions/workflows/ci.yml)
+
+**Reconstruction, Analysis and Visualisation of Metabolic Networks — in Python.**
+
+`raven-python` is the Python counterpart of the
+[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB). It builds on
+[**cobrapy**](https://github.com/opencobra/cobrapy) for everything cobrapy already does
+well (simulation, standard analyses, SBML I/O, model manipulation) and adds the
+functionality that's unique to RAVEN:
+
+* **De novo reconstruction** from KEGG and protein homology (BLAST / DIAMOND).
+* **Context-specific models** from omics data via **tINIT / ftINIT**, with task-aware
+ gap-filling and the linear-merge MILP reduction.
+* **Metabolic-task** validation (`check_tasks`, `fitTasks`).
+* **Connectivity gap-filling** against template models.
+* **Omics integration** — Human Protein Atlas (proteomics + RNA-seq) ingestion.
+* **Sub-cellular localisation** prediction by MILP, with partial-update mode and
+ pluggable predictors (WoLF PSORT, DeepLoc, …).
+* **N-model comparison**; **reporter metabolites**; **FSEOF**; **flux sampling**.
+* **YAML I/O** following the cobra standard, plus geckopy's `ec-*` enzyme-constrained
+ fields. **SIF** export. **RAVEN-style Excel** export.
+
+The status of every RAVEN function (ported, cheatsheet-mapped to cobra, or explicitly
+not ported) is documented function-by-function in
+**[docs/raven_migration.md](docs/raven_migration.md)**.
+
+## Design principle
+
+The canonical in-memory object is always a [`cobra.Model`](https://cobrapy.readthedocs.io).
+There is no parallel RAVEN struct, no `ravenCobraWrapper`-style adapter. RAVEN-specific
+fields that cobra doesn't model natively (`rxnMiriams`, `metDeltaG`,
+`rxnConfidenceScores`, …) live in cobra's `annotation` / `notes` dictionaries. This
+avoids duplicating cobra's data model and keeps raven-python interoperable with the wider
+COBRA ecosystem.
+
+## Status
+
+raven-python has been validated against MATLAB RAVEN on **Human-GEM** (5 Hart2015 cell-line
+models, Jaccard 0.975–0.980 — see [docs/humangem_validation.md](docs/humangem_validation.md)).
+The functional scope of the original RAVEN toolbox is covered with two principled
+omissions:
+
+* **MetaCyc-based reconstruction** is not implemented and is flagged for removal from
+ MATLAB RAVEN as well — see [IMPROVEMENTS.md](IMPROVEMENTS.md) under `R-MetaCyc`.
+* **Dynamic FBA** is not implemented — several maintained Python packages already cover
+ it ([`dfba`](https://pypi.org/project/dfba/), [`reframed`](https://pypi.org/project/reframed/),
+ [`mewpy`](https://pypi.org/project/mewpy/)).
+
+What's still open is catalogued in **[docs/todo.md](docs/todo.md)** (visualisation / Phase
+6 is the main item).
+
+## Installation (development)
+
+```bash
+git clone https://github.com/SysBioChalmers/raven-python
+cd raven-python
+pip install -e ".[dev]"
+```
+
+raven-python requires Python ≥ 3.11. Genome-scale (f)tINIT MILPs currently require **Gurobi**
+([details on solver portability](docs/init_solver_benchmark.md)); toy and unit-test work
+runs on the open-source GLPK.
+
+## Documentation
+
+See **[docs/README.md](docs/README.md)** for the documentation index.
+
+## Relationship to MATLAB RAVEN
+
+`raven-python` is a derivative work and is released under the same **GPL-3.0-or-later**
+license. If you use it in scientific work, please cite the RAVEN 2 paper:
+
+> Wang H, Marcišauskas S, Sánchez BJ, Domenzain I, Hermansson D, Agren R, Nielsen J,
+> Kerkhoven EJ. (2018) RAVEN 2.0: A versatile toolbox for metabolic network
+> reconstruction and a case study on *Streptomyces coelicolor*. PLoS Comput Biol 14(10):
+> e1006541.
+
+## License
+
+[GPL-3.0-or-later](LICENSE)
diff --git a/docs/maintaining_binaries.md b/docs/maintaining_binaries.md
new file mode 100644
index 0000000..df5b315
--- /dev/null
+++ b/docs/maintaining_binaries.md
@@ -0,0 +1,236 @@
+# Maintaining bundled binaries (BLAST+, DIAMOND, …)
+
+Audience: **raven-python maintainers / the GitHub repo owner.** This explains how
+raven-python ships external command-line tools, how to update their versions, and how
+to build **minimal-footprint** ZIPs to attach to a GitHub release.
+
+> End users never read this. They get a binary automatically via `ensure_binary`,
+> or use their own (system/conda) install. This doc is only for whoever publishes
+> the release assets.
+
+---
+
+## 1. How binary provisioning works
+
+raven-python does **not** vendor binaries in the git repo or on PyPI. Instead:
+
+1. For each tool we publish **version-pinned ZIPs as GitHub release assets**.
+2. A **registry** (`src/raven_python/binaries_registry.json`) maps each *bundle* to its
+ version, the executables it provides, and per-platform `{asset, sha256}`.
+3. At run time `raven_python.binaries.ensure_binary("blastp")` resolves a tool in this
+ order — and only reaches the download as a last resort:
+
+ ```
+ explicit binary= arg → env var (RAVEN_PYTHON_BLASTP / RAVEN_PYTHON_DIAMOND / …)
+ → shutil.which on PATH (system / conda / apt / brew)
+ → ensure_binary: download the pinned ZIP → verify SHA256 → cache → return path
+ → actionable error (with conda / manual instructions)
+ ```
+
+So a pre-installed binary always wins; the bundle is the zero-setup fallback.
+Pinning the version makes reconstruction **reproducible**.
+
+A *bundle* can provide several executables from one download (e.g. the `blast`
+bundle provides both `blastp` and `makeblastdb`), so they are fetched once.
+
+---
+
+## 2. What raven-python actually needs — ship only these
+
+Distribute the **minimum** set of executables. Everything else (other suite
+tools, docs, examples, changelogs) must be excluded.
+
+| Bundle | Executables to include | Everything else |
+|---|---|---|
+| `diamond` | `diamond` | — (it is a single static binary) |
+| `blast` | `blastp`, `makeblastdb` | **drop** `blastn`, `tblastn`, `psiblast`, `rpsblast`, `blast_formatter`, `*_vdb`, the `doc/`, `ChangeLog`, `README`, ~30 other tools |
+
+(Confirmed against RAVEN `getBlast`/`getDiamond`: only `makeblastdb`+`blastp`, and
+`diamond` for its `makedb`/`blastp` subcommands, are ever invoked.)
+
+For BLAST+ this is the big win: the full NCBI suite is ~hundreds of MB; two
+binaries (stripped) are a small fraction.
+
+---
+
+## 3. Asset & ZIP conventions
+
+**Asset filename:** `---.zip`
+
+- `` ∈ `linux`, `macos`, `windows`
+- `` ∈ `x86_64`, `arm64`
+- examples: `diamond-2.1.11-linux-x86_64.zip`, `blast-2.16.0-macos-arm64.zip`
+
+**ZIP layout — flat, executables at the root, plus the upstream licence:**
+
+```
+diamond-2.1.11-linux-x86_64.zip
+├── diamond
+└── LICENSE
+
+blast-2.16.0-linux-x86_64.zip
+├── blastp
+├── makeblastdb
+└── LICENSE
+```
+
+No nested `bin/`, no extra files. `ensure_binary` extracts the ZIP into the cache
+and expects the executable at the top level.
+
+---
+
+## 4. Step-by-step: add or update a version
+
+Example: bump DIAMOND to a new version for Linux x86-64. Repeat per `(os, arch)`.
+
+1. **Download the official upstream build** (never rebuild from source unless you
+ must):
+ - DIAMOND →
+ (`diamond-linux64.tar.gz`, `diamond-macos.tar.gz`)
+ - BLAST+ → or a
+ pinned version dir (`ncbi-blast-+-x64-linux.tar.gz`,
+ `-x64-macosx.tar.gz`, `-aarch64-linux.tar.gz`, `-x64-win64.tar.gz`).
+ - Record the upstream URL **and** its published checksum for provenance.
+2. **Extract only the needed executables** (see §2) to a clean staging dir.
+3. **Strip debug symbols** to shrink (skip on Windows / signed macOS builds):
+ ```bash
+ strip diamond # or: strip blastp makeblastdb
+ ```
+4. **Smoke-test the stripped binaries in a clean shell** (no other tools on PATH):
+ ```bash
+ ./diamond --version
+ ./blastp -version && ./makeblastdb -version
+ ```
+ If they fail for a missing shared library, add that `.so`/`.dylib` to the ZIP
+ (rare — NCBI/DIAMOND release builds are largely self-contained).
+5. **Add the upstream licence file** as `LICENSE` (see §6).
+6. **Zip with max compression, flat layout:**
+ ```bash
+ zip -9 -j diamond-2.1.11-linux-x86_64.zip diamond LICENSE
+ # -j junks paths so entries sit at the ZIP root
+ ```
+7. **Compute the SHA256:**
+ ```bash
+ sha256sum diamond-2.1.11-linux-x86_64.zip # shasum -a 256 on macOS
+ ```
+8. **Attach the ZIP to a raven-python GitHub release** (a release tagged for the binary
+ set, e.g. `binaries-2024.06`, keeps them independent of code releases).
+9. **Update the registry** `src/raven_python/binaries_registry.json` — bump `version`
+ and set the per-platform `asset` + `sha256`:
+ ```json
+ {
+ "diamond": {
+ "version": "2.1.11",
+ "provides": ["diamond"],
+ "platforms": {
+ "linux-x86_64": {
+ "asset": "diamond-2.1.11-linux-x86_64.zip",
+ "url": "https://github.com/SysBioChalmers/raven-python/releases/download/binaries-2024.06/diamond-2.1.11-linux-x86_64.zip",
+ "sha256": ""
+ }
+ }
+ },
+ "blast": {
+ "version": "2.16.0",
+ "provides": ["blastp", "makeblastdb"],
+ "platforms": { "linux-x86_64": { "asset": "...", "url": "...", "sha256": "..." } }
+ }
+ }
+ ```
+10. **Commit the registry change**, run the homology tests, and (if you have the
+ binary) confirm `ensure_binary("diamond", version="2.1.11")` downloads,
+ verifies, and runs.
+
+---
+
+## 5. Keeping the footprint minimal — checklist
+
+- ✅ Only the executables in §2 (for BLAST+, exactly `blastp` + `makeblastdb`).
+- ✅ `strip` the binaries (often halves their size).
+- ✅ `zip -9 -j` (max compression, flat — no `bin/`, no folders).
+- ✅ Exactly one extra file: `LICENSE`.
+- ❌ No docs, examples, `ChangeLog`, `README`, man pages, test data, or sibling tools.
+- ❌ No `.dSYM`/debug bundles; no duplicate static `.a` libraries.
+- ➕ Only add a shared library if step-4 testing proves it is required.
+
+---
+
+## 6. Platform / architecture matrix & licensing
+
+**Coverage = what you build.** Start with `linux-x86_64` (CI default), then add
+`macos-arm64`, `macos-x86_64`, `linux-arm64`, `windows-x86_64` as capacity allows.
+For any `(os, arch)` **not** in the registry, `ensure_binary` raises an actionable
+error pointing to conda (`conda install -c bioconda diamond blast`) or a manual
+install — that is the documented fallback, not a failure to fix urgently.
+
+**Licensing (must comply when redistributing):**
+
+- **BLAST+** — produced by NCBI (US Government); **public domain**, free to
+ redistribute. Include NCBI's `LICENSE` for courtesy/provenance.
+- **DIAMOND** — **GPLv3**. Redistribution is allowed; you **must** include the
+ GPLv3 licence text in the ZIP and keep the binary unmodified (or offer source).
+- **HMMER** (future) — BSD-3-Clause; include its `LICENSE`.
+
+Always ship the upstream licence in the ZIP, and keep a `BINARIES_PROVENANCE.md`
+(or a note in the release body) recording, per asset: upstream URL, upstream
+version, upstream checksum, and the SHA256 you published.
+
+### Native OS support per tool
+
+raven-python invokes each tool through `subprocess.run([resolved_path, …])` — that
+call is itself cross-platform, so the real constraint is whether a given tool has
+a binary that runs natively on each OS. It varies:
+
+| Tool | Linux | macOS (incl. arm64) | Windows (native) |
+|---|---|---|---|
+| BLAST+ (`blastp`, `makeblastdb`) | ✅ | ✅ | ✅ (NCBI ships Windows builds) |
+| DIAMOND | ✅ | ✅ | ⚠️ native build exists but Linux-first |
+| HMMER (`hmmbuild`/`hmmpress`/`hmmsearch`/`hmmscan`) | ✅ | ✅ | ❌ no official native build |
+| MAFFT | ✅ | ✅ | ⚠️ Windows package is a wrapper |
+| CD-HIT | ✅ | ✅ | ❌ no Windows build exists |
+
+Implications:
+
+- **Linux / macOS** — everything works. `conda install -c bioconda hmmer mafft
+ cd-hit blast diamond`, or point the `RAVEN_PYTHON_*` env vars at your installs.
+- **Native Windows** — the homology track (BLAST+/DIAMOND) works, but the **KEGG
+ HMM build (3b.3) and HMM query (3b.5) do not**: HMMER and CD-HIT have no Windows
+ binaries, and bioconda has no Windows packages for any of them. Bundling can't
+ fix this — there is no binary to bundle.
+- **Windows users should run raven-python inside WSL2** (or a Linux container), where
+ every tool is native Linux. raven-python does **not** replicate RAVEN's
+ `getWSLpath`/`wsl …` path translation: it calls the resolved binary directly, so
+ mixing native-Windows Python with WSL binaries is unsupported — keep the whole
+ stack inside WSL2.
+- The common end-user paths — homology reconstruction and the KEGG *species* model
+ (3b.4) — need no HMMER/MAFFT/CD-HIT, so they are fully cross-platform.
+
+---
+
+## 7. Emitting the registry entry
+
+After building the per-platform ZIPs (named `---.zip`)
+and uploading them to the release, generate the `_REGISTRY` entry — checksums and
+URLs — with [`scripts/make_registry_snippet.py`](../scripts/README.md):
+
+```bash
+python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \
+ --provides blastp makeblastdb --dir zips \
+ --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0
+```
+
+It prints the ready-to-paste `_REGISTRY["blast"]` block; its SHA256 helper is the
+same one `ensure_binary` verifies with, so the checksums always match. (Producing
+the minimal ZIPs themselves — download upstream, `strip`, `zip -9 -j`, add
+`LICENSE` per §3–§6 — is still a manual/per-tool step.)
+
+---
+
+## 8. Adding a new tool later (e.g. HMMER for KEGG reconstruction)
+
+1. Decide the **minimal executable set** (e.g. HMMER → `hmmsearch`, `hmmscan`,
+ maybe `hmmbuild`/`hmmpress`).
+2. Add a bundle entry to the registry with `provides` listing those executables.
+3. Build/attach ZIPs per §3–§4; include the tool's licence (§6).
+4. The wrappers call `ensure_binary("hmmsearch", …)` with the same resolution
+ order — no new provisioning code needed.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..faeeb1f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,84 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "raven-python"
+version = "0.0.1"
+description = "Reconstruction, Analysis and Visualization of Metabolic Networks in Python, a port of the RAVEN Toolbox built on cobrapy"
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "GPL-3.0-or-later" }
+authors = [
+ { name = "Eduard Kerkhoven", email = "eduardk@chalmers.se" },
+]
+keywords = [
+ "genome-scale-model",
+ "metabolic-model",
+ "reconstruction",
+ "raven",
+ "cobra",
+ "systems-biology",
+ "constraint-based-modeling",
+ "kegg",
+ "metacyc",
+ "tinit",
+]
+classifiers = [
+ "Development Status :: 2 - Pre-Alpha",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dependencies = [
+ "cobra>=0.29",
+ "numpy>=1.21",
+ "pandas>=2",
+ "scipy>=1.10",
+ "ruamel.yaml>=0.17",
+ "requests>=2.28",
+ "tqdm>=4.65",
+]
+
+[project.optional-dependencies]
+dev = [
+ "pytest>=7",
+ "pytest-cov",
+ "ruff>=0.4",
+]
+excel = [
+ "openpyxl>=3.1",
+]
+plotting = [
+ "matplotlib>=3.5",
+]
+
+[project.urls]
+Homepage = "https://github.com/SysBioChalmers/raven-python"
+Source = "https://github.com/SysBioChalmers/raven-python"
+Issues = "https://github.com/SysBioChalmers/raven-python/issues"
+"RAVEN MATLAB" = "https://github.com/SysBioChalmers/RAVEN"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/raven_python"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py311"
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I", "UP", "B"]
+ignore = [
+ "E501", # line length handled by the formatter
+]
+
+[tool.ruff.lint.per-file-ignores]
+"tests/*" = ["E402"]
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..67a3b36
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,35 @@
+# Maintainer scripts
+
+Release-time tooling. Not part of the installed package — run them from a checkout
+with raven-python installed (`pip install -e .`). End users never need these.
+
+## `build_kegg_artefacts.py`
+
+Build the publishable KEGG artefact set from an arranged KEGG dump (see
+`download_kegg_dump`): the gzipped-YAML reference model, the gzipped-TSV tables,
+and (with `--hmms`) the per-domain pressed HMM libraries. Output is laid out ready
+to upload as release assets. See [docs/maintaining_kegg_data.md](../docs/maintaining_kegg_data.md).
+
+```bash
+python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts # tables + model
+python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8
+```
+
+## `make_registry_snippet.py`
+
+After uploading the files to a release, compute their SHA256 and print the entry
+to merge into the runtime registry — `raven_python.data._DATA_REGISTRY` (data) or
+`raven_python.binaries._REGISTRY` (binary ZIP bundles). The checksum helper is shared
+with the resolvers, so published checksums always match what `ensure_data` /
+`ensure_binary` verify.
+
+```bash
+# Data artefacts:
+python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \
+ --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116
+
+# Binary bundle (ZIPs named ---.zip):
+python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \
+ --provides blastp makeblastdb --dir zips \
+ --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0
+```
diff --git a/scripts/make_registry_snippet.py b/scripts/make_registry_snippet.py
new file mode 100644
index 0000000..3efa49e
--- /dev/null
+++ b/scripts/make_registry_snippet.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+"""Emit ready-to-paste registry entries for published artefacts / binary ZIPs.
+
+Computes the SHA256 of each file and prints the Python/JSON entry to merge into
+``raven_python.data._DATA_REGISTRY`` (data artefacts) or ``raven_python.binaries._REGISTRY``
+(binary bundles). Run once per release, after uploading the files to the release.
+
+Examples
+--------
+Data artefacts (KEGG reference model + tables + HMM libraries) for one release::
+
+ python scripts/make_registry_snippet.py data \\
+ --dataset kegg --version kegg116 --dir artefacts \\
+ --base-url https://github.com/ORG/raven_python/releases/download/kegg-data-kegg116
+
+Binary bundle (one ZIP per platform, named ``---.zip``)::
+
+ python scripts/make_registry_snippet.py binary \\
+ --bundle blast --version 2.16.0 --provides blastp makeblastdb --dir zips \\
+ --base-url https://github.com/ORG/raven_python/releases/download/blast-2.16.0
+
+The SHA256 helper is shared with the runtime resolvers (``raven_python.binaries``), so
+published checksums always match what ``ensure_data`` / ``ensure_binary`` verify.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from raven_python.binaries import _sha256
+
+
+def _files_in(directory: Path) -> list[Path]:
+ """Regular, non-hidden files in ``directory``, sorted by name."""
+ return sorted(p for p in directory.iterdir() if p.is_file() and not p.name.startswith("."))
+
+
+def data_entry(dataset: str, version: str, base_url: str, directory: Path) -> dict:
+ """Build the ``_DATA_REGISTRY[dataset]`` entry for every file in ``directory``."""
+ base = base_url.rstrip("/")
+ files = {
+ p.name: {"url": f"{base}/{p.name}", "sha256": _sha256(p)} for p in _files_in(directory)
+ }
+ if not files:
+ raise SystemExit(f"No files found in {directory}")
+ return {"version": version, "files": files}
+
+
+def binary_entry(
+ bundle: str, version: str, provides: list[str], base_url: str, directory: Path
+) -> dict:
+ """Build the ``_REGISTRY[bundle]`` entry from ``---.zip``."""
+ base = base_url.rstrip("/")
+ prefix = f"{bundle}-{version}-"
+ platforms = {}
+ for zip_path in directory.glob(f"{prefix}*.zip"):
+ platform = zip_path.name[len(prefix) : -len(".zip")]
+ platforms[platform] = {"url": f"{base}/{zip_path.name}", "sha256": _sha256(zip_path)}
+ if not platforms:
+ raise SystemExit(f"No {prefix}*.zip files found in {directory}")
+ return {"version": version, "provides": provides, "platforms": dict(sorted(platforms.items()))}
+
+
+def render(key: str, entry: dict) -> str:
+ """Render ``{key: entry}`` as an indented JSON block (valid Python to paste)."""
+ return json.dumps({key: entry}, indent=4)
+
+
+def main(argv: list[str] | None = None) -> None:
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ sub = parser.add_subparsers(dest="kind", required=True)
+
+ d = sub.add_parser("data", help="data-artefact registry entry (raven_python.data)")
+ d.add_argument("--dataset", required=True, help="dataset key, e.g. 'kegg'")
+ d.add_argument("--version", required=True)
+ d.add_argument("--dir", required=True, type=Path, help="directory of uploaded artefacts")
+ d.add_argument("--base-url", required=True, help="release download URL prefix")
+
+ b = sub.add_parser("binary", help="binary-bundle registry entry (raven_python.binaries)")
+ b.add_argument("--bundle", required=True, help="bundle key, e.g. 'blast'")
+ b.add_argument("--version", required=True)
+ b.add_argument("--provides", nargs="+", required=True, help="executables the bundle provides")
+ b.add_argument("--dir", required=True, type=Path, help="directory of uploaded ZIPs")
+ b.add_argument("--base-url", required=True, help="release download URL prefix")
+
+ args = parser.parse_args(argv)
+ if args.kind == "data":
+ key, entry = args.dataset, data_entry(args.dataset, args.version, args.base_url, args.dir)
+ target = "raven_python/data.py _DATA_REGISTRY"
+ else:
+ key = args.bundle
+ entry = binary_entry(args.bundle, args.version, args.provides, args.base_url, args.dir)
+ target = "raven_python/binaries.py _REGISTRY"
+
+ print(f"# Merge into {target}:", file=sys.stderr)
+ print(render(key, entry))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/raven_python/__init__.py b/src/raven_python/__init__.py
new file mode 100644
index 0000000..591c4c1
--- /dev/null
+++ b/src/raven_python/__init__.py
@@ -0,0 +1,10 @@
+"""raven_python — Python counterpart of the RAVEN Toolbox, built on cobrapy.
+
+raven_python reuses cobrapy for simulation, standard analyses, SBML I/O, and model
+manipulation, and provides the RAVEN-specific functionality on top: de novo
+reconstruction (KEGG / homology), context-specific modeling (tINIT / ftINIT),
+metabolic task validation, connectivity gap-filling, omics integration (HPA),
+sub-cellular localisation, N-model comparison, and the RAVEN-style I/O formats.
+"""
+
+__version__ = "0.0.1"
diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py
new file mode 100644
index 0000000..2d4b5a2
--- /dev/null
+++ b/src/raven_python/binaries.py
@@ -0,0 +1,148 @@
+"""Locate and provision external command-line binaries (BLAST+, DIAMOND, …).
+
+Shared across tools (not homology-specific). Resolution order for any executable:
+
+ explicit path arg → env var (RAVEN_PYTHON_) → shutil.which (PATH)
+ → ensure_binary (download the version-pinned ZIP from a raven_python release,
+ verify SHA256, cache, return the path)
+ → FileNotFoundError with install guidance
+
+So a pre-installed/conda binary always wins; the bundled ZIP is the zero-setup
+fallback. See docs/maintaining_binaries.md for how the release ZIPs and the
+registry are produced and updated.
+"""
+from __future__ import annotations
+
+import hashlib
+import os
+import platform
+import shutil
+import zipfile
+from pathlib import Path
+from urllib.request import urlopen
+
+# Registry of bundled binaries. Empty until release ZIPs are published; populated
+# per docs/maintaining_binaries.md. Keyed by *bundle*; one bundle can provide
+# several executables (e.g. "blast" -> blastp + makeblastdb).
+# bundle -> {version, provides:[exe...], platforms:{"-": {url, sha256}}}
+_REGISTRY: dict = {}
+
+# Environment variable overrides per executable.
+_ENV_VARS = {
+ "diamond": "RAVEN_PYTHON_DIAMOND",
+ "blastp": "RAVEN_PYTHON_BLASTP",
+ "makeblastdb": "RAVEN_PYTHON_MAKEBLASTDB",
+ "hmmbuild": "RAVEN_PYTHON_HMMBUILD",
+ "hmmpress": "RAVEN_PYTHON_HMMPRESS",
+ "hmmsearch": "RAVEN_PYTHON_HMMSEARCH",
+ "hmmscan": "RAVEN_PYTHON_HMMSCAN",
+ "mafft": "RAVEN_PYTHON_MAFFT",
+ "cd-hit": "RAVEN_PYTHON_CDHIT",
+}
+
+
+def platform_key() -> str:
+ """Return the ``-`` key used in the registry (e.g. ``linux-x86_64``)."""
+ system = {"linux": "linux", "darwin": "macos", "windows": "windows"}.get(
+ platform.system().lower(), platform.system().lower()
+ )
+ machine = platform.machine().lower()
+ arch = {"x86_64": "x86_64", "amd64": "x86_64", "arm64": "arm64", "aarch64": "arm64"}.get(
+ machine, machine
+ )
+ return f"{system}-{arch}"
+
+
+def _cache_dir() -> Path:
+ base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")
+ return Path(base) / "raven_python" / "binaries"
+
+
+def _bundle_for(executable: str, registry: dict):
+ for name, bundle in registry.items():
+ if executable in bundle.get("provides", []):
+ return name, bundle
+ return None, None
+
+
+def _sha256(path: Path) -> str:
+ h = hashlib.sha256()
+ with open(path, "rb") as fh:
+ for chunk in iter(lambda: fh.read(1 << 20), b""):
+ h.update(chunk)
+ return h.hexdigest()
+
+
+def ensure_binary(executable: str, *, registry: dict | None = None) -> Path:
+ """Download (if needed) and return the path to a bundled ``executable``.
+
+ Consults the registry for the current platform, downloads the pinned ZIP,
+ verifies its SHA256, extracts it into the cache, and returns the executable
+ path. Raises ``FileNotFoundError`` if no bundle for this platform is hosted.
+ """
+ registry = _REGISTRY if registry is None else registry
+ bundle_name, bundle = _bundle_for(executable, registry)
+ if bundle is None:
+ raise FileNotFoundError(
+ f"No bundled binary registered for {executable!r}. Install it (e.g. "
+ f"`conda install -c bioconda {executable}`) or pass an explicit path."
+ )
+ key = platform_key()
+ entry = bundle.get("platforms", {}).get(key)
+ if entry is None:
+ raise FileNotFoundError(
+ f"No bundled {executable!r} for platform {key!r}. Install it "
+ f"(e.g. `conda install -c bioconda {executable}`), set "
+ f"{_ENV_VARS.get(executable, 'the binary path')}, or pass binary=."
+ )
+
+ dest_dir = _cache_dir() / f"{bundle_name}-{bundle['version']}-{key}"
+ exe = dest_dir / executable
+ if exe.exists():
+ return exe
+
+ dest_dir.mkdir(parents=True, exist_ok=True)
+ archive = dest_dir / "_download.zip"
+ # Download into a sibling .part file and rename on success — an interrupted
+ # download leaves the partial behind .part, never as a half-complete .zip
+ # that a later run might mistake for a finished one. Mirrors data.py.
+ part = archive.with_suffix(archive.suffix + ".part")
+ try:
+ with urlopen(entry["url"]) as resp, open(part, "wb") as out: # noqa: S310
+ shutil.copyfileobj(resp, out)
+ digest = _sha256(part)
+ if digest != entry["sha256"]:
+ raise ValueError(
+ f"SHA256 mismatch for {executable!r} ({key}): "
+ f"expected {entry['sha256']}, got {digest}."
+ )
+ os.replace(part, archive)
+ finally:
+ part.unlink(missing_ok=True)
+ with zipfile.ZipFile(archive) as zf:
+ zf.extractall(dest_dir)
+ archive.unlink(missing_ok=True)
+ if not exe.exists():
+ raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.")
+ exe.chmod(0o755)
+ return exe
+
+
+def resolve_binary(executable: str, *, binary: str | os.PathLike | None = None) -> str:
+ """Resolve an executable to a path: arg → env var → PATH → bundled ZIP → error."""
+ if binary is not None:
+ return os.fspath(binary)
+ env_var = _ENV_VARS.get(executable)
+ if env_var and os.environ.get(env_var):
+ return os.environ[env_var]
+ found = shutil.which(executable)
+ if found:
+ return found
+ try:
+ return os.fspath(ensure_binary(executable))
+ except FileNotFoundError as exc:
+ raise FileNotFoundError(
+ f"Could not find {executable!r}. Install it (e.g. "
+ f"`conda install -c bioconda {executable}`), put it on PATH, set "
+ f"{env_var or 'the binary path'}, or pass binary=. ({exc})"
+ ) from exc
diff --git a/src/raven_python/data.py b/src/raven_python/data.py
new file mode 100644
index 0000000..b1264be
--- /dev/null
+++ b/src/raven_python/data.py
@@ -0,0 +1,135 @@
+"""Fetch and cache published data artefacts (KEGG reference model, tables, HMMs).
+
+The mirror of :mod:`raven_python.binaries` for *data*: a version-pinned registry of
+downloadable artefacts, fetched on first use, SHA256-verified, and cached under
+platformdirs so end users never rebuild them from a KEGG dump (that is the
+maintainer's job — see docs/maintaining_kegg_data.md).
+
+Resolution for any artefact file:
+
+ explicit local dir → cached copy → download from the registry (verify,
+ cache) → FileNotFoundError with guidance
+
+The registry is **empty until the artefacts are published** (same as
+``binaries._REGISTRY``); until then ``ensure_data_file`` raises an actionable
+error. Cache layout::
+
+ $XDG_CACHE_HOME/raven_python/data/-/
+ (or ~/.cache/raven_python/data/... if XDG_CACHE_HOME is unset)
+"""
+from __future__ import annotations
+
+import os
+import shutil
+from pathlib import Path
+from urllib.request import urlopen
+
+from raven_python.binaries import _sha256
+
+# dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}}
+# Populated when raven_python publishes the KEGG artefacts as release assets.
+_DATA_REGISTRY: dict = {}
+
+# The core KEGG artefacts needed to build a model (no HMM libraries).
+CORE_KEGG_FILES = (
+ "reference_model.yml.gz",
+ "ko_reaction.tsv.gz",
+ "ko_names.tsv.gz",
+ "organism_gene_ko.tsv.xz",
+ "rxn_flags.tsv.gz",
+)
+
+
+def _data_cache_dir() -> Path:
+ base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")
+ return Path(base) / "raven_python" / "data"
+
+
+def _bundle(dataset: str, registry: dict) -> dict:
+ bundle = registry.get(dataset)
+ if bundle is None:
+ raise FileNotFoundError(
+ f"No data artefacts registered for {dataset!r}. Either pass a local "
+ f"directory of artefacts, or build them per docs/maintaining_kegg_data.md."
+ )
+ return bundle
+
+
+def ensure_data_file(
+ dataset: str,
+ filename: str,
+ *,
+ version: str | None = None,
+ registry: dict | None = None,
+) -> Path:
+ """Download (if needed) and return the cached path to one artefact file.
+
+ Looks the file up in the registry for ``dataset`` (at ``version`` or the
+ registry's default), downloads it to the version-pinned cache directory,
+ verifies its SHA256, and returns the path. Re-uses an already-cached copy.
+ """
+ registry = _DATA_REGISTRY if registry is None else registry
+ bundle = _bundle(dataset, registry)
+ ver = version or bundle["version"]
+ entry = bundle.get("files", {}).get(filename)
+ if entry is None:
+ raise FileNotFoundError(
+ f"{filename!r} is not registered for {dataset!r} {ver}. "
+ f"Available: {sorted(bundle.get('files', {}))}."
+ )
+
+ dest_dir = _data_cache_dir() / f"{dataset}-{ver}"
+ dest = dest_dir / filename
+ if dest.exists():
+ return dest
+
+ dest_dir.mkdir(parents=True, exist_ok=True)
+ tmp = dest.with_name(dest.name + ".part")
+ with urlopen(entry["url"]) as resp, open(tmp, "wb") as out: # noqa: S310 (trusted registry URLs)
+ shutil.copyfileobj(resp, out)
+ digest = _sha256(tmp)
+ if digest != entry["sha256"]:
+ tmp.unlink(missing_ok=True)
+ raise ValueError(
+ f"SHA256 mismatch for {dataset}/{filename} ({ver}): "
+ f"expected {entry['sha256']}, got {digest}."
+ )
+ tmp.replace(dest)
+ return dest
+
+
+def ensure_kegg_data(
+ *,
+ version: str | None = None,
+ files: tuple[str, ...] = CORE_KEGG_FILES,
+ registry: dict | None = None,
+) -> Path:
+ """Ensure the core KEGG artefacts are cached; return their directory.
+
+ Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg``
+ dataset and returns the cache directory holding them — ready to pass as the
+ ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`.
+ """
+ registry = _DATA_REGISTRY if registry is None else registry
+ ver = version or _bundle("kegg", registry)["version"]
+ for filename in files:
+ ensure_data_file("kegg", filename, version=ver, registry=registry)
+ return _data_cache_dir() / f"kegg-{ver}"
+
+
+def ensure_kegg_hmm_library(
+ domain: str, *, version: str | None = None, registry: dict | None = None
+) -> Path:
+ """Ensure a domain HMM library (and its hmmpress index) is cached; return its path.
+
+ ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``.hmm``
+ plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the
+ path to the ``.hmm`` (the argument for :func:`run_hmmscan`).
+ """
+ registry = _DATA_REGISTRY if registry is None else registry
+ ver = version or _bundle("kegg", registry)["version"]
+ base = f"{domain}.hmm"
+ library = ensure_data_file("kegg", base, version=ver, registry=registry)
+ for suffix in (".h3f", ".h3i", ".h3m", ".h3p"):
+ ensure_data_file("kegg", base + suffix, version=ver, registry=registry)
+ return library
diff --git a/src/raven_python/io/__init__.py b/src/raven_python/io/__init__.py
new file mode 100644
index 0000000..bc70511
--- /dev/null
+++ b/src/raven_python/io/__init__.py
@@ -0,0 +1,15 @@
+"""RAVEN-specific I/O: YAML (cobra + Metabolic Atlas / Human-GEM extensions), SIF,
+Excel export, and the Standard-GEM ``model//…`` git layout.
+"""
+from raven_python.io.excel import export_to_excel
+from raven_python.io.git import export_for_git
+from raven_python.io.sif import export_model_to_sif
+from raven_python.io.yaml import read_yaml_model, write_yaml_model
+
+__all__ = [
+ "export_for_git",
+ "export_model_to_sif",
+ "export_to_excel",
+ "read_yaml_model",
+ "write_yaml_model",
+]
diff --git a/src/raven_python/io/excel.py b/src/raven_python/io/excel.py
new file mode 100644
index 0000000..cf6196e
--- /dev/null
+++ b/src/raven_python/io/excel.py
@@ -0,0 +1,136 @@
+"""Export a model to the RAVEN Microsoft Excel format.
+
+Writes the five-sheet RAVEN xlsx layout — RXNS, METS, COMPS, GENES, MODEL — pulling
+RAVEN-specific values back out of cobra's ``annotation`` / ``notes`` (where the
+raven_python YAML reader stashes them). Excel *import* is intentionally not provided.
+
+Requires the optional ``openpyxl`` dependency (``pip install raven_python[excel]``).
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import cobra
+
+
+def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str:
+ """RAVEN MIRIAM column: ``namespace/id;namespace/id2;...`` (sorted)."""
+ parts = []
+ for namespace in sorted(annotation):
+ if namespace in exclude:
+ continue
+ values = annotation[namespace]
+ if isinstance(values, str):
+ values = [values]
+ parts.extend(f"{namespace}/{value}" for value in values)
+ return ";".join(parts)
+
+
+def _equation(rxn: cobra.Reaction) -> str:
+ """Human-readable equation in RAVEN ``name[comp]`` form."""
+
+ def side(items):
+ return " + ".join(
+ f"{abs(coef):g} {met.name}[{met.compartment}]" for met, coef in items
+ )
+
+ reactants = [(m, c) for m, c in rxn.metabolites.items() if c < 0]
+ products = [(m, c) for m, c in rxn.metabolites.items() if c > 0]
+ arrow = " <=> " if rxn.reversibility else " => "
+ return f"{side(reactants)}{arrow}{side(products)}"
+
+
+def _ec_codes(rxn: cobra.Reaction) -> str:
+ codes = rxn.annotation.get("ec-code", [])
+ if isinstance(codes, str):
+ codes = [codes]
+ return ";".join(codes)
+
+
+def export_to_excel(
+ model: cobra.Model, path: str | Path, *, sort_ids: bool = False
+) -> None:
+ """Write ``model`` to a RAVEN-format ``.xlsx`` file.
+
+ Parameters
+ ----------
+ sort_ids
+ If True, write reactions/metabolites/genes sorted alphabetically by ID
+ (the model itself is not modified).
+ """
+ try:
+ from openpyxl import Workbook
+ except ImportError as exc: # pragma: no cover - exercised only without openpyxl
+ raise ImportError(
+ "export_to_excel requires openpyxl. Install it with "
+ "`pip install raven_python[excel]` (or `pip install openpyxl`)."
+ ) from exc
+
+ reactions = sorted(model.reactions, key=lambda r: r.id) if sort_ids else list(model.reactions)
+ metabolites = (
+ sorted(model.metabolites, key=lambda m: m.id) if sort_ids else list(model.metabolites)
+ )
+ genes = sorted(model.genes, key=lambda g: g.id) if sort_ids else list(model.genes)
+ metadata = dict(model.notes.get("metaData", {})) if model.notes else {}
+
+ wb = Workbook()
+ wb.remove(wb.active) # drop the default empty sheet
+
+ # --- RXNS ---
+ ws = wb.create_sheet("RXNS")
+ ws.append(
+ ["#", "ID", "NAME", "EQUATION", "EC-NUMBER", "GENE ASSOCIATION", "LOWER BOUND",
+ "UPPER BOUND", "OBJECTIVE", "COMPARTMENT", "MIRIAM", "SUBSYSTEM",
+ "REPLACEMENT ID", "NOTE", "REFERENCE", "CONFIDENCE SCORE"]
+ )
+ for r in reactions:
+ subsystem = r.subsystem
+ if isinstance(subsystem, (list, tuple)):
+ subsystem = ";".join(subsystem)
+ ws.append([
+ None, r.id, r.name, _equation(r), _ec_codes(r), r.gene_reaction_rule,
+ r.lower_bound, r.upper_bound,
+ r.objective_coefficient or None, None,
+ _miriam_string(r.annotation, exclude=("ec-code",)), subsystem, None,
+ r.notes.get("note"), r.notes.get("references"), r.notes.get("confidence_score"),
+ ])
+
+ # --- METS ---
+ ws = wb.create_sheet("METS")
+ ws.append(["#", "ID", "NAME", "UNCONSTRAINED", "MIRIAM", "COMPOSITION", "InChI",
+ "COMPARTMENT", "REPLACEMENT ID", "CHARGE"])
+ for m in metabolites:
+ inchi = m.notes.get("inchis")
+ ws.append([
+ None, f"{m.name}[{m.compartment}]", m.name, None,
+ _miriam_string(m.annotation, exclude=("smiles",)),
+ None if inchi else m.formula, inchi, m.compartment, m.id, m.charge,
+ ])
+
+ # --- COMPS ---
+ ws = wb.create_sheet("COMPS")
+ ws.append(["#", "ABBREVIATION", "NAME", "INSIDE", "MIRIAM"])
+ comps = sorted(model.compartments) if sort_ids else list(model.compartments)
+ for cid in comps:
+ ws.append([None, cid, model.compartments.get(cid, ""), None, None])
+
+ # --- GENES ---
+ if genes:
+ ws = wb.create_sheet("GENES")
+ ws.append(["#", "NAME", "MIRIAM", "SHORT NAME", "COMPARTMENT"])
+ for g in genes:
+ ws.append([None, g.id, _miriam_string(g.annotation), g.name, None])
+
+ # --- MODEL ---
+ ws = wb.create_sheet("MODEL")
+ ws.append(["#", "ID", "NAME", "TAXONOMY", "DEFAULT LOWER", "DEFAULT UPPER",
+ "CONTACT GIVEN NAME", "CONTACT FAMILY NAME", "CONTACT EMAIL",
+ "ORGANIZATION", "NOTES"])
+ ws.append([
+ None, model.id or "blankID", model.name or "blankName",
+ metadata.get("taxonomy"), metadata.get("defaultLB"), metadata.get("defaultUB"),
+ metadata.get("givenName"), metadata.get("familyName"), metadata.get("email"),
+ metadata.get("organization"), metadata.get("note"),
+ ])
+
+ wb.save(str(path))
diff --git a/src/raven_python/io/git.py b/src/raven_python/io/git.py
new file mode 100644
index 0000000..80bf8e8
--- /dev/null
+++ b/src/raven_python/io/git.py
@@ -0,0 +1,106 @@
+"""Export a model into a Standard-GEM versioned-repository layout.
+
+Writes the model in several formats into the Standard-GEM folder structure (a
+``model/`` directory with one subfolder per format), ready to commit to a
+Git-maintained model repository (Metabolic Atlas / Human-GEM / yeast-GEM style),
+plus a ``dependencies.txt`` recording tool versions.
+
+Thin orchestration over the writers raven_python already exposes: ``write_yaml_model``,
+cobra's ``write_sbml_model`` and ``save_matlab_model``, ``export_to_excel``, plus a
+single-file reaction table (txt).
+"""
+from __future__ import annotations
+
+import importlib.metadata as _md
+import platform
+from collections.abc import Iterable
+from pathlib import Path
+
+import cobra
+
+from raven_python.io.excel import _equation, export_to_excel
+from raven_python.io.yaml import write_yaml_model
+from raven_python.utils.sort import sort_identifiers
+
+_ALL_FORMATS = ("yml", "xml", "mat", "xlsx", "txt")
+
+
+def _version(package: str) -> str:
+ try:
+ return _md.version(package)
+ except _md.PackageNotFoundError:
+ return "unknown"
+
+
+def _write_txt(model: cobra.Model, path: Path) -> None:
+ """Single-file, human-readable reaction table (RAVEN exportForGit txt)."""
+ with open(path, "w", encoding="utf-8") as fh:
+ fh.write("Rxn name\tFormula\tGene-reaction association\tLB\tUB\tObjective\n")
+ for r in model.reactions:
+ fh.write(
+ f"{r.id}\t{_equation(r)}\t{r.gene_reaction_rule}\t"
+ f"{r.lower_bound:g}\t{r.upper_bound:g}\t{r.objective_coefficient:g}\n"
+ )
+
+
+def export_for_git(
+ model: cobra.Model,
+ path: str | Path = ".",
+ *,
+ prefix: str = "model",
+ formats: Iterable[str] = ("yml", "xml", "mat", "xlsx"),
+ sub_dirs: bool = True,
+) -> Path:
+ """Write ``model`` into a Standard-GEM repository layout.
+
+ Parameters
+ ----------
+ path
+ Directory to populate.
+ prefix
+ Base filename for every format (default ``"model"``).
+ formats
+ Which formats to write; any of ``"yml"``, ``"xml"``, ``"mat"``,
+ ``"xlsx"``, ``"txt"`` (default ``yml``/``xml``/``mat``/``xlsx``).
+ sub_dirs
+ If True (default), write ``model//.`` (standard-GEM
+ layout); otherwise all files go directly in ``path``.
+
+ Returns
+ -------
+ pathlib.Path
+ The root directory written to.
+ """
+ formats = list(formats)
+ unknown = set(formats) - set(_ALL_FORMATS)
+ if unknown:
+ raise ValueError(f"Unknown format(s): {sorted(unknown)}; allowed: {_ALL_FORMATS}")
+
+ # Sort a copy so the caller's model is untouched.
+ model = sort_identifiers(model.copy())
+
+ root = Path(path) / "model" if sub_dirs else Path(path)
+ root.mkdir(parents=True, exist_ok=True)
+
+ def target(fmt: str) -> Path:
+ folder = root / fmt if sub_dirs else root
+ folder.mkdir(parents=True, exist_ok=True)
+ return folder / f"{prefix}.{fmt}"
+
+ if "yml" in formats:
+ write_yaml_model(model, target("yml"))
+ if "xml" in formats:
+ cobra.io.write_sbml_model(model, str(target("xml")))
+ if "mat" in formats:
+ cobra.io.save_matlab_model(model, str(target("mat")))
+ if "xlsx" in formats:
+ export_to_excel(model, target("xlsx"))
+ if "txt" in formats:
+ _write_txt(model, target("txt"))
+
+ with open(root / "dependencies.txt", "w", encoding="utf-8") as fh:
+ fh.write(f"python\t{platform.python_version()}\n")
+ fh.write(f"cobra\t{_version('cobra')}\n")
+ fh.write(f"raven_python\t{_version('raven_python')}\n")
+
+ return root
diff --git a/src/raven_python/io/sif.py b/src/raven_python/io/sif.py
new file mode 100644
index 0000000..9e73efa
--- /dev/null
+++ b/src/raven_python/io/sif.py
@@ -0,0 +1,96 @@
+"""Export a model to Cytoscape SIF (Simple Interaction Format).
+
+Three graph types are supported:
+
+* ``"rc"`` reaction–compound: each reaction linked to its metabolites;
+* ``"rr"`` reaction–reaction: reactions linked when they share a metabolite;
+* ``"cc"`` compound–compound: each substrate linked to the products of the
+ reactions it feeds (computed on an irreversible copy, as RAVEN does, to avoid
+ spurious double links from reversible reactions).
+
+A SIF line is ``source graph_type target1 target2 ...``.
+"""
+from __future__ import annotations
+
+import warnings
+from collections import Counter
+from collections.abc import Mapping
+from pathlib import Path
+
+import cobra
+
+from raven_python.manipulation.irreversible import convert_to_irreversible
+
+_GRAPH_TYPES = ("rc", "rr", "cc")
+
+
+def _edges(model, graph_type):
+ """Yield (source_object, [target_objects]) per the graph type."""
+ if graph_type == "rc":
+ for rxn in model.reactions:
+ yield rxn, list(rxn.metabolites)
+ elif graph_type == "rr":
+ for rxn in model.reactions:
+ neighbours = {r for met in rxn.metabolites for r in met.reactions}
+ neighbours.discard(rxn)
+ yield rxn, list(neighbours)
+ else: # cc — on an irreversible copy
+ irrev = model.copy()
+ convert_to_irreversible(irrev)
+ for met in irrev.metabolites:
+ products: set = set()
+ for rxn in met.reactions:
+ if rxn.get_coefficient(met) < 0: # met is a substrate here
+ products.update(m for m, c in rxn.metabolites.items() if c > 0)
+ yield met, list(products)
+
+
+def export_model_to_sif(
+ model: cobra.Model,
+ path: str | Path,
+ graph_type: str = "rc",
+ *,
+ reaction_labels: Mapping[str, str] | None = None,
+ metabolite_labels: Mapping[str, str] | None = None,
+) -> None:
+ """Write ``model`` to a Cytoscape SIF file.
+
+ Parameters
+ ----------
+ graph_type
+ ``"rc"`` (reaction–compound, default), ``"rr"`` (reaction–reaction), or
+ ``"cc"`` (compound–compound).
+ reaction_labels, metabolite_labels
+ Optional ``{id: label}`` maps overriding the node labels (default: IDs).
+ """
+ if graph_type not in _GRAPH_TYPES:
+ raise ValueError(f"graph_type must be one of {_GRAPH_TYPES}, got {graph_type!r}")
+
+ rlabels = reaction_labels or {}
+ mlabels = metabolite_labels or {}
+
+ # Warn when the label maps collapse multiple distinct ids onto the same
+ # label: target-side dedup runs on labels, so the collision silently merges
+ # two nodes into one edge. Only check the ids actually mapped (cobra default
+ # labels are ids, which can't collide).
+ for kind, lmap in (("reaction", rlabels), ("metabolite", mlabels)):
+ duplicates = [lab for lab, n in Counter(lmap.values()).items() if n > 1]
+ if duplicates:
+ warnings.warn(
+ f"{kind}_labels maps multiple ids to the same label(s) "
+ f"({duplicates[:5]}{'…' if len(duplicates) > 5 else ''}); "
+ "SIF nodes are keyed by label, so those nodes will collapse.",
+ stacklevel=2,
+ )
+
+ def label(obj) -> str:
+ if isinstance(obj, cobra.Reaction):
+ return rlabels.get(obj.id, obj.id)
+ return mlabels.get(obj.id, obj.id)
+
+ with open(path, "w", encoding="utf-8") as handle:
+ for source, targets in _edges(model, graph_type):
+ src = label(source)
+ names = sorted({label(t) for t in targets} - {src})
+ if names:
+ handle.write(f"{src}\t{graph_type}\t" + "\t".join(names) + "\n")
diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py
new file mode 100644
index 0000000..151954b
--- /dev/null
+++ b/src/raven_python/io/yaml.py
@@ -0,0 +1,191 @@
+"""Read and write RAVEN/cobrapy YAML models.
+
+Aligned to RAVEN ``writeYAMLmodel.m`` / ``readYAMLmodel.m`` as of the
+``feat/geckopy-compat-yaml`` work (commit fa281a1), whose writer emits **cobra's
+native ``!!omap`` YAML**. Because the format *is* cobra's, the standard model
+content — id, name, compartments, and per-entry id/name/compartment/formula/
+charge/bounds/gene_reaction_rule/objective_coefficient/subsystem/metabolites and
+the whole ``annotation`` block (which carries ``smiles`` for metabolites,
+``ec-code`` for reactions, and all MIRIAM cross-references) — is read and written
+by ``cobra.io`` directly.
+
+This module only handles what cobra drops or mishandles:
+
+* **RAVEN-only top-level per-entry keys** that cobra ignores: ``inchis``,
+ ``deltaG``, ``metFrom`` and the free-text ``notes`` (metNotes) on metabolites;
+ ``confidence_score``, ``references``, ``rxnFrom``, ``deltaG`` and ``notes``
+ (rxnNotes) on reactions; ``protein`` on genes. These are stashed in the cobra
+ object's ``.notes`` dict on read and lifted back to top-level keys on write.
+* **Model-level extras** cobra ignores: ``version``, the ``metaData`` provenance
+ block, and the GECKO sections (``gecko_light``/``ec-rxns``/``ec-enzymes``),
+ preserved on ``model.notes`` for round-tripping.
+
+The reader also accepts the older RAVEN files (id/name nested in ``metaData``).
+"""
+from __future__ import annotations
+
+import gzip
+from collections import OrderedDict
+from pathlib import Path
+
+import cobra
+from cobra.io.dict import model_from_dict, model_to_dict
+from cobra.io.yaml import yaml as _cobra_yaml # ruamel round-trip YAML (handles !!omap)
+
+
+def _open_text(path: str | Path, mode: str):
+ """Open ``path`` as a text handle, transparently gzipping when it ends ``.gz``."""
+ if str(path).endswith(".gz"):
+ return gzip.open(path, f"{mode}t", encoding="utf-8")
+ return open(path, mode, encoding="utf-8")
+
+# RAVEN-only top-level per-entry keys -> the key used inside the cobra object's
+# .notes dict. ('notes' is RAVEN's free-text metNotes/rxnNotes; stored under
+# 'note' to avoid colliding with the notes container itself.)
+_MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note"))
+_RXN_FIELDS = (
+ ("confidence_score", "confidence_score"),
+ ("references", "references"),
+ ("rxnFrom", "rxnFrom"),
+ ("deltaG", "deltaG"),
+ ("notes", "note"),
+)
+_GENE_FIELDS = (("protein", "protein"),)
+
+_COBRA_TOP_KEYS = frozenset({"metabolites", "reactions", "genes", "compartments", "id", "name"})
+
+
+def _to_plain(obj):
+ if isinstance(obj, dict):
+ return {str(k): _to_plain(v) for k, v in obj.items()}
+ if isinstance(obj, (list, tuple)):
+ return [_to_plain(v) for v in obj]
+ if isinstance(obj, bool) or obj is None:
+ return obj
+ if isinstance(obj, int):
+ return int(obj)
+ if isinstance(obj, float):
+ return float(obj)
+ return obj if isinstance(obj, str) else str(obj)
+
+
+def _capture_entry_fields(entries, fields):
+ """Pop RAVEN-only top-level keys off each entry into a parallel notes dict.
+
+ Returns a list of ``{notes_key: value}`` dicts aligned with ``entries`` (so
+ cobra never sees these keys), to be attached to the built objects afterwards.
+ """
+ captured = []
+ for entry in entries:
+ notes = {}
+ for yaml_key, notes_key in fields:
+ if yaml_key in entry:
+ notes[notes_key] = entry.pop(yaml_key)
+ captured.append(notes)
+ return captured
+
+
+def read_yaml_model(path: str | Path) -> cobra.Model:
+ """Read a RAVEN/cobrapy YAML model into a ``cobra.Model``."""
+ with _open_text(path, "r") as handle:
+ raw = _to_plain(_cobra_yaml.load(handle))
+
+ if not isinstance(raw, dict):
+ raise ValueError(f"{path}: top-level YAML is a {type(raw).__name__}, not a mapping.")
+
+ metadata = raw.pop("metaData", None) or {}
+ version = raw.pop("version", None)
+ foreign = {k: raw.pop(k) for k in list(raw) if k not in _COBRA_TOP_KEYS}
+
+ met_notes = _capture_entry_fields(raw.get("metabolites", []), _MET_FIELDS)
+ rxn_notes = _capture_entry_fields(raw.get("reactions", []), _RXN_FIELDS)
+ gene_notes = _capture_entry_fields(raw.get("genes", []), _GENE_FIELDS)
+
+ model = model_from_dict(raw)
+
+ for met, notes in zip(model.metabolites, met_notes, strict=False):
+ met.notes = notes
+ for rxn, notes in zip(model.reactions, rxn_notes, strict=False):
+ rxn.notes = notes
+ for gene, notes in zip(model.genes, gene_notes, strict=False):
+ gene.notes = notes
+
+ # Legacy files keep id/name inside metaData; restore them if cobra found none.
+ if metadata.get("id") and not model.id:
+ model.id = metadata["id"]
+ if metadata.get("name") and not model.name:
+ model.name = metadata["name"]
+ if metadata:
+ model.notes["metaData"] = metadata
+ if version is not None:
+ model.notes["version"] = version
+ if foreign:
+ model.notes["_yaml_sections"] = foreign
+
+ return model
+
+
+def _emit_entry_fields(entries, fields):
+ """Lift RAVEN-only keys out of each entry's ``notes`` dict to top level."""
+ for entry in entries:
+ notes = entry.pop("notes", None)
+ if not isinstance(notes, dict):
+ continue
+ notes = dict(notes)
+ for yaml_key, notes_key in fields:
+ if notes_key in notes:
+ entry[yaml_key] = notes.pop(notes_key)
+ # Preserve any remaining (non-RAVEN) notes. The RAVEN free-text note is lifted
+ # to the YAML key "notes"; if leftovers also exist, merge them with it under
+ # that key (rather than silently dropping the leftovers).
+ if notes:
+ if "notes" in entry:
+ notes["note"] = entry["notes"]
+ entry["notes"] = notes
+
+
+def write_yaml_model(
+ model: cobra.Model, path: str | Path, *, sort_ids: bool = False
+) -> None:
+ """Write a ``cobra.Model`` to RAVEN/cobrapy (``!!omap``) YAML.
+
+ With ``sort_ids=True`` metabolites/reactions/genes/compartments are written
+ in alphabetical order (diff-friendly), without modifying ``model``.
+ """
+ model_notes = dict(model.notes or {})
+ stored_meta = model_notes.pop("metaData", None) or {}
+ version = model_notes.pop("version", None)
+ foreign = model_notes.pop("_yaml_sections", None) or {}
+
+ doc = OrderedDict(_to_plain(model_to_dict(model)))
+
+ if sort_ids:
+ for section in ("metabolites", "reactions", "genes"):
+ if section in doc:
+ doc[section] = sorted(doc[section], key=lambda e: e.get("id", ""))
+ if isinstance(doc.get("compartments"), dict):
+ doc["compartments"] = dict(sorted(doc["compartments"].items()))
+
+ _emit_entry_fields(doc.get("metabolites", []), _MET_FIELDS)
+ _emit_entry_fields(doc.get("reactions", []), _RXN_FIELDS)
+ _emit_entry_fields(doc.get("genes", []), _GENE_FIELDS)
+
+ # cobra dict order is metabolites, reactions, genes, id, name, compartments;
+ # append version / gecko_light / metaData / ec-* like RAVEN's writer.
+ if version is not None:
+ doc["version"] = version
+ metadata = dict(stored_meta)
+ if model.id:
+ metadata.setdefault("id", model.id)
+ if model.name:
+ metadata.setdefault("name", model.name)
+ for key in ("gecko_light",):
+ if key in foreign:
+ doc[key] = foreign.pop(key)
+ if metadata:
+ doc["metaData"] = metadata
+ for key, value in foreign.items():
+ doc[key] = value
+
+ with _open_text(path, "w") as handle:
+ _cobra_yaml.dump(doc, handle)
diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py
new file mode 100644
index 0000000..074c36f
--- /dev/null
+++ b/src/raven_python/manipulation/__init__.py
@@ -0,0 +1,36 @@
+"""Generic cobra.Model structural transforms that cobrapy does not cover cleanly:
+reaction building from equations, batch GPR / bound changes, irreversibility splitting,
+isozyme expansion, compartment merge / copy, and model merging by name."""
+from .add import add_reactions_from_equations
+from .change import change_gene_reaction_rules, change_reaction_equations
+from .expand import expand_model
+from .irreversible import convert_to_irreversible
+from .merge import merge_models
+from .parameters import set_variance_bounds
+from .remove import remove_genes, remove_metabolites
+from .simplify import (
+ constrain_reversible_reactions,
+ group_linear_reactions,
+ remove_dead_end_reactions,
+ remove_duplicate_reactions,
+)
+from .transfer import add_reactions_from_model
+from .transport import add_transport_reactions
+
+__all__ = [
+ "add_reactions_from_equations",
+ "add_reactions_from_model",
+ "add_transport_reactions",
+ "change_gene_reaction_rules",
+ "change_reaction_equations",
+ "constrain_reversible_reactions",
+ "convert_to_irreversible",
+ "expand_model",
+ "group_linear_reactions",
+ "merge_models",
+ "remove_dead_end_reactions",
+ "remove_duplicate_reactions",
+ "remove_genes",
+ "remove_metabolites",
+ "set_variance_bounds",
+]
diff --git a/src/raven_python/manipulation/add.py b/src/raven_python/manipulation/add.py
new file mode 100644
index 0000000..3842297
--- /dev/null
+++ b/src/raven_python/manipulation/add.py
@@ -0,0 +1,345 @@
+"""Add reactions to a model from equation strings.
+
+Most of the equivalent MATLAB code is struct-of-arrays bookkeeping (padding parallel
+``rxnNames`` / ``lb`` / ``ub`` / ``grRules`` / ... fields) that does not exist in
+cobra, where each ``Reaction`` carries its own attributes. cobra also already
+covers a large part of the *behaviour*:
+
+* ``Reaction.build_reaction_from_string`` parses equation strings, coefficients,
+ and reversibility arrows (``<=>``, ``-->``, ``=>``) and creates unknown
+ metabolites — but only matching metabolites **by ID**, and it leaves new
+ metabolites with ``compartment=None``.
+* assigning ``reaction.gene_reaction_rule`` auto-creates ``Gene`` objects.
+
+So this port keeps only the parts cobra lacks:
+
+* **name-based matching** — interpret equation tokens as metabolite *names*
+ (RAVEN eqnType 2) or as ``name[comp]`` (eqnType 3), not just IDs;
+* **correct compartment** assignment for newly created metabolites;
+* **strict policies** — optionally *error* (rather than silently create) on
+ unknown metabolites or genes, and always error on a duplicate reaction ID
+ (cobra silently ignores those).
+
+Instead of RAVEN's ``eqnType`` integer (1/2/3) the matching mode is a readable
+keyword: ``mets_by="id"`` or ``mets_by="name"``, with ``name[comp]`` recognised
+automatically. See IMPROVEMENTS.md (A-series) for the rationale.
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from collections import OrderedDict
+from collections.abc import Mapping, Sequence
+
+import cobra
+from cobra import Metabolite, Reaction
+from cobra.core.gene import GPR
+
+from raven_python.utils.parse import parse_name_comp
+
+# Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it).
+_REVERSIBLE_ARROWS = ("<=>",)
+_FORWARD_ARROWS = ("-->", "->", "=>")
+
+
+def _split_equation(equation: str) -> tuple[str, str, bool]:
+ """Split an equation into (lhs, rhs, reversible) on its arrow."""
+ for arrow in _REVERSIBLE_ARROWS:
+ if arrow in equation:
+ lhs, rhs = equation.split(arrow, 1)
+ return lhs, rhs, True
+ for arrow in _FORWARD_ARROWS:
+ if arrow in equation:
+ lhs, rhs = equation.split(arrow, 1)
+ return lhs, rhs, False
+ raise ValueError(f"No reaction arrow (<=>, -->, =>) found in equation: {equation!r}")
+
+
+def _parse_side(side: str) -> list[tuple[float, str, str | None]]:
+ """Parse one side of an equation into ``[(coefficient, token, fallback), ...]``.
+
+ The ``fallback`` slot is for the ambiguous ``" "`` shape: when
+ matching by name, ``"2 oxoglutarate"`` could be either ``coeff=2, name="oxoglutarate"``
+ or ``coeff=1, name="2 oxoglutarate"`` (a real chemistry name). We return the
+ coefficient-split form as the primary and the full term as the fallback; the
+ resolver picks whichever matches an existing metabolite. Pure-number heads
+ with no name (``"2"``) and pure-name terms (``"glucose"``) have no fallback.
+ """
+ terms: list[tuple[float, str, str | None]] = []
+ for raw in side.split(" + "):
+ term = raw.strip()
+ if not term:
+ continue
+ head, _, tail = term.partition(" ")
+ try:
+ coeff = float(head)
+ token = tail.strip()
+ except ValueError:
+ coeff, token = 1.0, term
+ fallback = None
+ else:
+ # Coefficient-split succeeded. Keep the full term as a fallback when
+ # the tail is non-empty so name-resolution can re-try it as one token.
+ fallback = term if token else None
+ if not token:
+ raise ValueError(f"Missing metabolite after coefficient in term: {raw!r}")
+ terms.append((coeff, token, fallback))
+ return terms
+
+
+def _new_met_id(model: cobra.Model, prefix: str) -> str:
+ """Next free ```` metabolite ID (RAVEN m1, m2, ... scheme)."""
+ pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+ used = [int(m.group(1)) for met in model.metabolites if (m := pattern.match(met.id))]
+ n = max(used) + 1 if used else 1
+ while f"{prefix}{n}" in model.metabolites:
+ n += 1
+ return f"{prefix}{n}"
+
+
+def _try_existing(
+ model: cobra.Model, token: str, *, mets_by: str, compartment: str | None
+) -> Metabolite | None:
+ """Look up ``token`` as an existing metabolite (no creation, no side effects).
+
+ Returns the matching metabolite or ``None``. Used by ``_stoichiometry`` to
+ disambiguate the ``" "`` shape: if a metabolite whose *name*
+ (or id) literally contains a leading number exists, prefer it over splitting
+ the number off as a coefficient.
+ """
+ name, comp = parse_name_comp(token)
+ if mets_by == "id" and comp is None:
+ return model.metabolites.get_by_id(token) if token in model.metabolites else None
+ target_comp = comp if comp is not None else compartment
+ if target_comp is None:
+ return None
+ for met in model.metabolites:
+ if met.name == name and met.compartment == target_comp:
+ return met
+ return None
+
+
+def _resolve_metabolite(
+ model: cobra.Model,
+ token: str,
+ *,
+ mets_by: str,
+ compartment: str | None,
+ allow_new_mets: bool,
+ new_met_prefix: str,
+) -> Metabolite:
+ """Resolve an equation token to an existing or newly created Metabolite."""
+ name, comp = parse_name_comp(token)
+
+ if mets_by == "id" and comp is None:
+ # token is a metabolite ID
+ if token in model.metabolites:
+ return model.metabolites.get_by_id(token)
+ if not allow_new_mets:
+ raise ValueError(
+ f"Unknown metabolite ID {token!r}; pass allow_new_mets=True to create it."
+ )
+ if compartment is None:
+ raise ValueError(
+ f"Cannot create metabolite {token!r}: no compartment given."
+ )
+ _warn_unknown_compartment(model, compartment, token)
+ met = Metabolite(token, compartment=compartment)
+ model.add_metabolites([met])
+ return met
+
+ # name-based (mets_by="name") or explicit name[comp]
+ target_comp = comp if comp is not None else compartment
+ if target_comp is None:
+ raise ValueError(
+ f"Metabolite {token!r} matched by name needs a compartment; "
+ "pass compartment=... or use the name[comp] syntax."
+ )
+ if comp is not None and target_comp not in model.compartments and not allow_new_mets:
+ raise ValueError(f"Compartment {target_comp!r} is not in the model.")
+
+ matches = [
+ met
+ for met in model.metabolites
+ if met.name == name and met.compartment == target_comp
+ ]
+ if matches:
+ return matches[0]
+ if not allow_new_mets:
+ raise ValueError(
+ f"No metabolite named {name!r} in compartment {target_comp!r}; "
+ "pass allow_new_mets=True to create it."
+ )
+ _warn_unknown_compartment(model, target_comp, name)
+ met = Metabolite(_new_met_id(model, new_met_prefix), name=name, compartment=target_comp)
+ model.add_metabolites([met])
+ return met
+
+
+def _warn_unknown_compartment(model: cobra.Model, compartment: str, identifier: str) -> None:
+ """Warn when a new metabolite would be born into a not-yet-registered compartment.
+
+ Both ``mets_by`` paths previously created the metabolite without validating
+ the compartment, so a typo (``"cyto"`` for ``"c"``) silently produced a
+ one-metabolite ghost compartment. cobra inherits the compartment from the
+ first metabolite assigned to it, so the fix is a warning, not a hard error.
+ """
+ known = set(model.compartments) | set(model._compartments)
+ if compartment not in known:
+ warnings.warn(
+ f"Creating metabolite {identifier!r} in unregistered compartment "
+ f"{compartment!r} (existing: {sorted(known) or 'none'}); "
+ "add the compartment first or check for a typo.",
+ stacklevel=5,
+ )
+
+
+def _stoichiometry(
+ model: cobra.Model,
+ equation: str,
+ *,
+ mets_by: str,
+ compartment: str | None,
+ allow_new_mets: bool,
+ new_met_prefix: str,
+) -> tuple[dict[Metabolite, float], bool]:
+ """Parse an equation into a {Metabolite: net coefficient} dict + reversibility."""
+ lhs, rhs, reversible = _split_equation(equation)
+ coeffs: OrderedDict[Metabolite, float] = OrderedDict()
+ had_terms = False
+ for sign, side in ((-1.0, lhs), (1.0, rhs)):
+ for coeff, token, fallback in _parse_side(side):
+ had_terms = True
+ # " " is ambiguous when the name itself starts with a
+ # number (e.g. "2 oxoglutarate"). Prefer the full-term interpretation
+ # when it matches an existing metabolite — otherwise fall through to
+ # the coefficient-split form.
+ met = None
+ if fallback is not None:
+ met = _try_existing(
+ model, fallback, mets_by=mets_by, compartment=compartment
+ )
+ if met is not None:
+ coeff = 1.0
+ if met is None:
+ met = _resolve_metabolite(
+ model,
+ token,
+ mets_by=mets_by,
+ compartment=compartment,
+ allow_new_mets=allow_new_mets,
+ new_met_prefix=new_met_prefix,
+ )
+ coeffs[met] = coeffs.get(met, 0.0) + sign * coeff
+ # Drop metabolites that net to zero (present as both substrate and product).
+ coeffs = OrderedDict((met, c) for met, c in coeffs.items() if c != 0.0)
+ if had_terms and not coeffs:
+ warnings.warn(
+ f"Equation {equation!r} has no net metabolites (all terms cancelled); "
+ "the reaction will be added with empty stoichiometry.",
+ stacklevel=4,
+ )
+ return dict(coeffs), reversible
+
+
+def add_reactions_from_equations(
+ model: cobra.Model,
+ reactions: Sequence[Mapping],
+ *,
+ mets_by: str = "id",
+ compartment: str | None = None,
+ allow_new_mets: bool = True,
+ allow_new_genes: bool = True,
+ new_met_prefix: str = "m",
+) -> list[Reaction]:
+ """Add reactions defined by equation strings, matching mets by ID or name.
+ Parameters
+ ----------
+ model
+ Target ``cobra.Model``, mutated in place.
+ reactions
+ Sequence of mappings, one per reaction. Recognised keys:
+
+ * ``id`` (**required**) — reaction ID; must not already exist.
+ * ``equation`` (**required**) — e.g. ``"atp_c + h2o_c <=> adp_c + pi_c"``.
+ Use ``<=>`` for reversible, ``-->``/``->``/``=>`` for irreversible.
+ * ``name`` — reaction name.
+ * ``bounds`` — ``(lower, upper)`` tuple; overrides the arrow.
+ * ``gene_reaction_rule`` — GPR string.
+ * ``subsystem`` — subsystem name.
+ mets_by
+ How bare equation tokens (without ``[comp]``) are matched:
+ ``"id"`` (RAVEN eqnType 1) or ``"name"`` (eqnType 2). A ``name[comp]``
+ token (eqnType 3) is always matched by name + compartment.
+ compartment
+ Default compartment for new metabolites and for name-matched tokens
+ without an explicit ``[comp]``.
+ allow_new_mets
+ If True (default), create metabolites not found. New metabolites get
+ ``compartment`` (id mode) or an auto ID ``m1``, ``m2``, ... (name mode).
+ If False, an unknown metabolite raises.
+ allow_new_genes
+ If True (default), genes in a GPR are auto-created by cobra. If False,
+ a GPR referencing a gene not already in the model raises.
+ new_met_prefix
+ Prefix for auto-generated metabolite IDs in name mode (default ``"m"``).
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions added, in input order.
+ """
+ if mets_by not in ("id", "name"):
+ raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")
+
+ known_genes = {gene.id for gene in model.genes}
+ added: list[Reaction] = []
+
+ for spec in reactions:
+ if "id" not in spec:
+ raise ValueError(f"Reaction spec missing required 'id': {spec!r}")
+ rxn_id = spec["id"]
+ if rxn_id in model.reactions:
+ raise ValueError(
+ f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first."
+ )
+ if "equation" not in spec:
+ raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.")
+
+ coeffs, reversible = _stoichiometry(
+ model,
+ spec["equation"],
+ mets_by=mets_by,
+ compartment=compartment,
+ allow_new_mets=allow_new_mets,
+ new_met_prefix=new_met_prefix,
+ )
+
+ rxn = Reaction(rxn_id, name=spec.get("name", ""))
+ if "bounds" in spec:
+ rxn.bounds = tuple(spec["bounds"])
+ else:
+ config = cobra.Configuration()
+ lower = config.lower_bound if reversible else 0.0
+ rxn.bounds = (lower, config.upper_bound)
+ if "subsystem" in spec:
+ rxn.subsystem = spec["subsystem"]
+
+ model.add_reactions([rxn])
+ rxn.add_metabolites(coeffs)
+
+ rule = spec.get("gene_reaction_rule", "")
+ if rule:
+ if not allow_new_genes:
+ missing = sorted(set(GPR.from_string(rule).genes) - known_genes)
+ if missing:
+ raise ValueError(
+ f"Reaction {rxn_id!r} references genes not in the model: "
+ f"{missing}. Set allow_new_genes=True or add them first."
+ )
+ rxn.gene_reaction_rule = rule
+ known_genes.update(gene.id for gene in rxn.genes)
+
+ added.append(rxn)
+
+ return added
diff --git a/src/raven_python/manipulation/change.py b/src/raven_python/manipulation/change.py
new file mode 100644
index 0000000..78612ba
--- /dev/null
+++ b/src/raven_python/manipulation/change.py
@@ -0,0 +1,125 @@
+"""Change the stoichiometry of existing reactions from equation strings.
+
+Editing the same ``Reaction`` object changes only its stoichiometry — its id, name,
+bounds, GPR, subsystem, and position are preserved automatically by cobra.
+
+So this port simply re-parses the equation (reusing the same metabolite
+matching as :func:`~raven_python.manipulation.add.add_reactions_from_equations`,
+including name and ``name[comp]`` modes that cobra lacks) and swaps the
+metabolites in place.
+
+Like RAVEN, **bounds are left unchanged** even if the new equation's arrow
+implies a different reversibility — use a bounds setter for that.
+"""
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+import cobra
+from cobra import Reaction
+
+from raven_python.manipulation.add import _stoichiometry
+
+__all__ = ["change_reaction_equations", "change_gene_reaction_rules"]
+
+
+def change_reaction_equations(
+ model: cobra.Model,
+ equations: Mapping[str, str],
+ *,
+ mets_by: str = "id",
+ compartment: str | None = None,
+ allow_new_mets: bool = True,
+ new_met_prefix: str = "m",
+) -> list[Reaction]:
+ """Replace the stoichiometry of existing reactions.
+ Parameters
+ ----------
+ model
+ Target ``cobra.Model``, mutated in place.
+ equations
+ Mapping of ``reaction_id -> equation string``. Every ID must already
+ exist in the model. Equation syntax is identical to
+ :func:`~raven_python.manipulation.add.add_reactions_from_equations`.
+ mets_by, compartment, allow_new_mets, new_met_prefix
+ Metabolite-matching options, as in ``add_reactions_from_equations``.
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions changed, in input order.
+
+ Notes
+ -----
+ Bounds are **not** modified, matching RAVEN. Changing an equation from
+ ``-->`` to ``<=>`` does not by itself make the reaction reversible; adjust
+ the bounds separately.
+ """
+ if mets_by not in ("id", "name"):
+ raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")
+
+ changed: list[Reaction] = []
+ for rxn_id, equation in equations.items():
+ if rxn_id not in model.reactions:
+ raise ValueError(f"Reaction {rxn_id!r} not found in the model.")
+ rxn = model.reactions.get_by_id(rxn_id)
+
+ coeffs, _reversible = _stoichiometry(
+ model,
+ equation,
+ mets_by=mets_by,
+ compartment=compartment,
+ allow_new_mets=allow_new_mets,
+ new_met_prefix=new_met_prefix,
+ )
+
+ rxn.subtract_metabolites(dict(rxn.metabolites), combine=True)
+ rxn.add_metabolites(coeffs)
+ changed.append(rxn)
+
+ return changed
+
+
+def change_gene_reaction_rules(
+ model: cobra.Model,
+ rules: Mapping[str, str],
+ *,
+ replace: bool = True,
+) -> list[Reaction]:
+ """Set or append gene-reaction rules on existing reactions.
+ cobra already does the heavy lifting on assignment to
+ ``reaction.gene_reaction_rule``: it auto-creates any new ``Gene`` objects and
+ normalises the rule. So the value here is batching plus RAVEN's ``replace``
+ option to **append** rather than overwrite.
+
+ Parameters
+ ----------
+ model
+ Target ``cobra.Model``, mutated in place.
+ rules
+ Mapping of ``reaction_id -> GPR string``. Every ID must already exist.
+ replace
+ If True (default), overwrite the existing GPR. If False, append the new
+ rule as an isozyme: ``(old) or (new)`` (just ``new`` if the reaction had
+ no GPR).
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions changed, in input order.
+ """
+ changed: list[Reaction] = []
+ for rxn_id, rule in rules.items():
+ if rxn_id not in model.reactions:
+ raise ValueError(f"Reaction {rxn_id!r} not found in the model.")
+ rxn = model.reactions.get_by_id(rxn_id)
+
+ if replace or not rxn.gene_reaction_rule:
+ new_rule = rule
+ else:
+ new_rule = f"({rxn.gene_reaction_rule}) or ({rule})"
+
+ rxn.gene_reaction_rule = new_rule # cobra creates genes + normalises
+ changed.append(rxn)
+
+ return changed
diff --git a/src/raven_python/manipulation/compartments.py b/src/raven_python/manipulation/compartments.py
new file mode 100644
index 0000000..091d196
--- /dev/null
+++ b/src/raven_python/manipulation/compartments.py
@@ -0,0 +1,196 @@
+"""Compartment manipulation — merge all compartments into one, or copy reactions to a
+new compartment (ports of RAVEN's ``mergeCompartments`` and ``copyToComps``).
+
+Both functions are useful **independently of** :func:`raven_python.localization.predict_localization`:
+``merge_compartments`` flattens a multi-compartment model for a simplified analysis
+(e.g. checking whether the network can in principle make a metabolite, with no
+compartment topology in the way); ``copy_to_compartment`` is a building block for
+constructing dual-localised pathways. cobra has no equivalents.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+
+# Compartments produced by merge_compartments (RAVEN uses 's' for "system").
+_MERGED_COMPARTMENT = "s"
+
+
+def merge_compartments(
+ model: cobra.Model,
+ *,
+ merged_id: str = _MERGED_COMPARTMENT,
+ merged_name: str = "system",
+ drop_single_metabolite_reactions: bool = True,
+ deduplicate_reactions: bool = True,
+) -> tuple[cobra.Model, list[str], list[str]]:
+ """Merge every metabolite of ``model`` into one ``merged_id`` compartment.
+
+ Returns ``(model_copy, deleted_single_met_reactions, deduplicated_reactions)``. The
+ returned model is a deep copy of the input. Use cases:
+
+ * Check whether the network can produce/consume a metabolite at all (compartment
+ topology is often what makes a model look blocked).
+ * Simplify a model for visualisation or an analysis that doesn't care about
+ compartments.
+ * As a pre-step for localisation when the user does want RAVEN's
+ "start from scratch" workflow (call :func:`merge_compartments` then
+ :func:`raven_python.localization.predict_localization` with the full reaction list).
+
+ Metabolites that already share a base id (e.g. ``glc__D_c`` and ``glc__D_e`` both
+ map to ``glc__D``) collapse into one entity in the merged compartment; their
+ stoichiometric contributions are summed per reaction. Reactions that end up with
+ only one metabolite (e.g. ``A[c] → A[m]`` becomes ``A → A`` = nothing) are deleted
+ by default (RAVEN's ``deleteRxnsWithOneMet``). Reactions that become identical
+ after merging are deduplicated (one survives).
+ """
+ out = model.copy()
+
+ # 1. For each metabolite, derive a base id (strip the trailing _).
+ # Two mets in different compartments sharing the base id collapse to one.
+ new_to_old: dict[str, list[cobra.Metabolite]] = {}
+ for m in list(out.metabolites):
+ base = _base_id(m)
+ new_to_old.setdefault(base, []).append(m)
+
+ # 2. Build the merged metabolites and rewrite reactions.
+ canonical: dict[str, cobra.Metabolite] = {}
+ for base, mets in new_to_old.items():
+ proto = mets[0]
+ new_met = cobra.Metabolite(base, name=proto.name, compartment=merged_id,
+ formula=proto.formula, charge=proto.charge)
+ new_met.notes = dict(proto.notes or {})
+ canonical[base] = new_met
+
+ # Rewrite all reactions: replace each metabolite with its canonical, summing
+ # coefficients where multiple original mets collapse to one.
+ rewritten: dict[str, dict[str, float]] = {}
+ for r in list(out.reactions):
+ new_stoich: dict[cobra.Metabolite, float] = {}
+ for m, coeff in list(r.metabolites.items()):
+ canon = canonical[_base_id(m)]
+ new_stoich[canon] = new_stoich.get(canon, 0.0) + coeff
+ # Drop zero net coefficients (substrate + product of the same base met cancel).
+ new_stoich = {m: c for m, c in new_stoich.items() if c != 0.0}
+ rewritten[r.id] = {m.id: c for m, c in new_stoich.items()}
+
+ # Now build a fresh model with the canonical mets + rewritten reactions; the
+ # cobra in-place rewrite would require careful constraint surgery, so a clean
+ # rebuild is simpler and less error-prone.
+ merged = cobra.Model(out.id or "merged")
+ merged.compartments = {merged_id: merged_name}
+ merged.add_metabolites(list(canonical.values()))
+ deleted_single: list[str] = []
+ deduplicated: list[str] = []
+ seen_signatures: dict[tuple, str] = {}
+ keep_reactions: list[cobra.Reaction] = []
+ for r in out.reactions:
+ stoich = rewritten[r.id]
+ if drop_single_metabolite_reactions and len(stoich) <= 1:
+ deleted_single.append(r.id)
+ continue
+ if not stoich: # everything cancelled
+ deleted_single.append(r.id)
+ continue
+ sig = (frozenset(stoich.items()), bool(r.lower_bound < 0), bool(r.upper_bound > 0))
+ if deduplicate_reactions and sig in seen_signatures:
+ deduplicated.append(r.id)
+ continue
+ seen_signatures[sig] = r.id
+ new_r = cobra.Reaction(r.id, name=r.name, lower_bound=r.lower_bound,
+ upper_bound=r.upper_bound)
+ new_r.add_metabolites({merged.metabolites.get_by_id(mid): c for mid, c in stoich.items()})
+ new_r.gene_reaction_rule = r.gene_reaction_rule
+ if r.subsystem:
+ new_r.subsystem = r.subsystem
+ new_r.notes = dict(r.notes or {})
+ keep_reactions.append(new_r)
+ merged.add_reactions(keep_reactions)
+ return merged, deleted_single, deduplicated
+
+
+def copy_to_compartment(
+ model: cobra.Model,
+ reactions: Iterable[str],
+ target_compartment: str,
+ *,
+ target_compartment_name: str | None = None,
+ delete_original: bool = False,
+ id_suffix: str | None = None,
+) -> tuple[cobra.Model, list[str], list[str]]:
+ """Copy a set of reactions into ``target_compartment``. RAVEN's ``copyToComps``.
+
+ Returns ``(model_copy, new_reaction_ids, new_metabolite_ids)``. Use cases:
+
+ * Build a dual-localised pathway (e.g. duplicate glycolysis into a peroxisome).
+ * Mirror a curated subsystem into an additional compartment as a draft to refine.
+ * Set up the input for a flux comparison between alternate compartmentalisations.
+
+ Each copied reaction is given the id ``"_"`` (default
+ ``id_suffix=target_compartment``); each metabolite it touches is mapped to (or
+ created in) ``target_compartment`` with the same suffix convention. ``delete_original=True``
+ moves the reactions instead of copying.
+ """
+ out = model.copy()
+ suffix = id_suffix if id_suffix is not None else target_compartment
+ if target_compartment not in out.compartments:
+ out.compartments = {**out.compartments,
+ target_compartment: target_compartment_name or target_compartment}
+
+ preexisting_met_ids = {x.id for x in out.metabolites}
+ new_rxn_ids: list[str] = []
+ for rid in list(reactions):
+ if rid not in out.reactions:
+ raise ValueError(f"reaction {rid!r} not in model")
+ src = out.reactions.get_by_id(rid)
+ new_id = f"{rid}_{suffix}"
+ if new_id in out.reactions:
+ continue # already copied; idempotent
+ new_stoich: dict[cobra.Metabolite, float] = {}
+ for m, coeff in src.metabolites.items():
+ target_met = _met_in_compartment(out, m, target_compartment, suffix=suffix)
+ new_stoich[target_met] = coeff
+ new_r = cobra.Reaction(new_id, name=src.name,
+ lower_bound=src.lower_bound, upper_bound=src.upper_bound)
+ new_r.add_metabolites(new_stoich)
+ new_r.gene_reaction_rule = src.gene_reaction_rule
+ if src.subsystem:
+ new_r.subsystem = src.subsystem
+ new_r.notes = dict(src.notes or {})
+ out.add_reactions([new_r])
+ new_rxn_ids.append(new_id)
+ if delete_original:
+ out.remove_reactions([src.id], remove_orphans=False)
+
+ new_met_ids = [m.id for m in out.metabolites if m.id not in preexisting_met_ids]
+ return out, new_rxn_ids, new_met_ids
+
+
+# ----------------------------------------------------------------- helpers
+
+def _base_id(m: cobra.Metabolite) -> str:
+ """Strip the trailing ``_`` suffix from a metabolite id (if present)."""
+ if m.compartment and m.id.endswith(f"_{m.compartment}"):
+ return m.id[: -(len(m.compartment) + 1)]
+ return m.id
+
+
+def _met_in_compartment(model: cobra.Model, source: cobra.Metabolite,
+ compartment: str, *, suffix: str | None = None) -> cobra.Metabolite:
+ """Return (creating if needed) the copy of ``source`` in ``compartment``.
+
+ The new metabolite id is ``"_"`` (default ``suffix=compartment``).
+ Already-existing copies are reused.
+ """
+ if source.compartment == compartment:
+ return source
+ base = _base_id(source)
+ new_id = f"{base}_{suffix if suffix is not None else compartment}"
+ if new_id in model.metabolites:
+ return model.metabolites.get_by_id(new_id)
+ new_met = cobra.Metabolite(new_id, name=source.name, compartment=compartment,
+ formula=source.formula, charge=source.charge)
+ new_met.notes = dict(source.notes or {})
+ model.add_metabolites([new_met])
+ return new_met
diff --git a/src/raven_python/manipulation/expand.py b/src/raven_python/manipulation/expand.py
new file mode 100644
index 0000000..246f3b9
--- /dev/null
+++ b/src/raven_python/manipulation/expand.py
@@ -0,0 +1,124 @@
+"""Expand reactions with isozymes into one reaction per isozyme.
+
+Operates on cobra's GPR AST, so the model stays a plain ``cobra.Model`` throughout.
+
+Provenance: this implementation was first written for geckopy
+(``geckopy/ec_model/pipeline/expand.py``, where it backed makeEcModel stage 5)
+and is adopted here as its canonical home; geckopy will import it from raven_python
+once raven_python is published.
+
+MATLAB-COMPAT: GECKO MATLAB and RAVEN ``expandModel.m`` use string manipulation
+on grRules to detect and split isozymes. raven_python uses cobrapy's GPR AST
+instead. Output should be equivalent for any well-formed GPR; cases that differ
+are likely malformed GPR strings that the AST flags as invalid.
+"""
+from __future__ import annotations
+
+import ast
+import copy
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _gpr_to_dnf(gpr: GPR) -> list[list[str]]:
+ """Convert a GPR to disjunctive normal form (list of AND-clauses).
+
+ An empty GPR yields an empty list. A single clause (no OR anywhere)
+ yields a list of length 1. OR-of-ANDs yields one sublist per
+ disjunct, each containing the gene names ANDed together.
+
+ Handles distributivity: ``g1 and (g2 or g3)`` becomes
+ ``[[g1, g2], [g1, g3]]``.
+ """
+ if gpr is None or gpr.body is None:
+ return []
+ return _node_to_dnf(gpr.body)
+
+
+def _node_to_dnf(node) -> list[list[str]]:
+ """Recursive helper. Returns DNF as list of AND-clauses."""
+ if isinstance(node, ast.Name):
+ return [[node.id]]
+ if isinstance(node, ast.BoolOp):
+ if isinstance(node.op, ast.Or):
+ result: list[list[str]] = []
+ for child in node.values:
+ result.extend(_node_to_dnf(child))
+ return result
+ if isinstance(node.op, ast.And):
+ clauses: list[list[str]] = [[]]
+ for child in node.values:
+ child_dnf = _node_to_dnf(child)
+ new_clauses: list[list[str]] = []
+ for existing in clauses:
+ for extra in child_dnf:
+ new_clauses.append(existing + extra)
+ clauses = new_clauses
+ return clauses
+ raise ValueError(f"Unexpected GPR node type: {type(node).__name__}")
+
+
+def expand_model(model: cobra.Model) -> list[str]:
+ """Split reactions with isozymes (OR in GPR) into one reaction per isozyme.
+ For each reaction whose GPR contains at least one OR, the reaction
+ is removed and replaced by one copy per disjunctive clause. The new
+ reactions get ID suffix ``_EXP_1``, ``_EXP_2``, etc. All other
+ fields (stoichiometry, bounds, name, subsystem) are copied verbatim;
+ only the GPR is simplified to the single AND-clause for that
+ isozyme.
+
+ Reactions with no GPR, or with a GPR that has no OR, are left
+ untouched.
+
+ Parameters
+ ----------
+ model
+ A cobra.Model, mutated in place.
+
+ Returns
+ -------
+ list of str
+ Sorted IDs of newly added expanded reactions (those with
+ ``_EXP_N`` suffixes). The original reactions that were split
+ are no longer in the model.
+ """
+ expansions: list[tuple[cobra.Reaction, list[list[str]]]] = []
+
+ for rxn in model.reactions:
+ if not rxn.gene_reaction_rule:
+ continue
+ clauses = _gpr_to_dnf(rxn.gpr)
+ if len(clauses) <= 1:
+ continue
+ expansions.append((rxn, clauses))
+
+ added_ids: list[str] = []
+ for original_rxn, clauses in expansions:
+ new_rxns: list[cobra.Reaction] = []
+ for i, clause in enumerate(clauses, start=1):
+ new_rxn = cobra.Reaction(
+ id=f"{original_rxn.id}_EXP_{i}",
+ name=original_rxn.name,
+ )
+ new_rxn.lower_bound = original_rxn.lower_bound
+ new_rxn.upper_bound = original_rxn.upper_bound
+ new_rxn.add_metabolites(dict(original_rxn.metabolites.items()))
+ new_rxn.subsystem = original_rxn.subsystem
+ new_rxn.gene_reaction_rule = " and ".join(clause)
+ # Propagate per-reaction metadata (notably ec-code / annotations)
+ # so downstream functions see the same annotations on expanded
+ # reactions as on the original. Deep-copy so siblings are independent.
+ new_rxn.annotation = copy.deepcopy(original_rxn.annotation)
+ new_rxn.notes = copy.deepcopy(original_rxn.notes)
+ new_rxns.append(new_rxn)
+
+ obj_coeff = original_rxn.objective_coefficient
+ model.remove_reactions([original_rxn])
+ model.add_reactions(new_rxns)
+ if obj_coeff: # keep the original in the objective — sum over its isozyme copies
+ for new_rxn in new_rxns:
+ new_rxn.objective_coefficient = obj_coeff
+ added_ids.extend(r.id for r in new_rxns)
+
+ return sorted(added_ids)
diff --git a/src/raven_python/manipulation/irreversible.py b/src/raven_python/manipulation/irreversible.py
new file mode 100644
index 0000000..3f64a68
--- /dev/null
+++ b/src/raven_python/manipulation/irreversible.py
@@ -0,0 +1,72 @@
+"""Convert reversible reactions to an irreversible (forward + reverse) form.
+
+cobrapy's own ``convert_to_irreversible`` was removed, so this is a genuine
+implementation rather than a wrapper.
+
+Provenance: first written for geckopy
+(``geckopy/ec_model/pipeline/preprocess.py``, makeEcModel stage 4, tagged
+"RAVENpy candidate") and adopted here as its canonical home; geckopy will
+import it from raven_python once raven_python is published.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def convert_to_irreversible(model: cobra.Model) -> list[str]:
+ """Split non-exchange reversible reactions into a forward + reverse pair.
+ For each non-exchange reaction with ``lb < 0``:
+
+ - The original reaction is kept as the forward direction. Its
+ lower bound is clamped to 0.
+ - A new reaction with the same ID plus a ``_REV`` suffix is added,
+ representing the reverse direction. Its stoichiometry is the
+ negation of the original, its bounds are ``(0, -original_lb)``,
+ and it inherits the name (with " (reversible)" appended) and the
+ gene-protein rule of the original.
+
+ Exchange reactions (boundary reactions) are never split, regardless
+ of their bounds, matching MATLAB behavior where exchange reactions
+ are explicitly excluded from ``convertToIrrev``.
+
+ Parameters
+ ----------
+ model
+ A cobra.Model, mutated in place.
+
+ Returns
+ -------
+ list of str
+ Sorted IDs of newly added reverse reactions (the ones ending in
+ ``_REV``). The forward reactions retain their original IDs.
+ """
+ reverse_rxns_to_add: list[cobra.Reaction] = []
+ forward_updates: list[cobra.Reaction] = []
+
+ for rxn in model.reactions:
+ if rxn.boundary:
+ continue
+ if rxn.lower_bound >= 0:
+ continue
+
+ original_lb = rxn.lower_bound
+
+ rev_rxn = cobra.Reaction(
+ id=f"{rxn.id}_REV",
+ name=(f"{rxn.name} (reversible)" if rxn.name else f"{rxn.id}_REV"),
+ )
+ rev_rxn.lower_bound = 0.0
+ rev_rxn.upper_bound = -original_lb
+ rev_rxn.add_metabolites({m: -c for m, c in rxn.metabolites.items()})
+ rev_rxn.gene_reaction_rule = rxn.gene_reaction_rule
+
+ reverse_rxns_to_add.append(rev_rxn)
+ forward_updates.append(rxn)
+
+ for rxn in forward_updates:
+ rxn.lower_bound = 0.0
+
+ if reverse_rxns_to_add:
+ model.add_reactions(reverse_rxns_to_add)
+
+ return sorted(r.id for r in reverse_rxns_to_add)
diff --git a/src/raven_python/manipulation/merge.py b/src/raven_python/manipulation/merge.py
new file mode 100644
index 0000000..bfa1f24
--- /dev/null
+++ b/src/raven_python/manipulation/merge.py
@@ -0,0 +1,146 @@
+"""Merge several models into one.
+
+cobra's ``Model.merge`` is pairwise and matches everything strictly by id; this
+merges **N** models and unifies metabolites by **name[compartment]** (so the same
+compound under different ids in two models becomes one), while adding **all**
+reactions without de-duplication
+(a reaction whose ID already exists is renamed ``id_``). Genes are
+unified by ID. Provenance (which source model each object came from) is recorded
+in ``notes['origin']``.
+
+The bulk of RAVEN's function is struct field-padding and manual S-matrix
+assembly, none of which is needed on ``cobra.Model``.
+"""
+from __future__ import annotations
+
+import copy
+import warnings
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Model, Reaction
+
+
+def _unique_id(existing, base: str, suffix: str) -> str:
+ """Return base, or base_suffix (then base_suffix_2, ...) if it collides."""
+ if base not in existing:
+ return base
+ candidate = f"{base}_{suffix}"
+ n = 2
+ while candidate in existing:
+ candidate = f"{base}_{suffix}_{n}"
+ n += 1
+ return candidate
+
+
+def merge_models(
+ models: Iterable[cobra.Model],
+ *,
+ match_by: str = "name",
+ track_origin: bool = True,
+) -> cobra.Model:
+ """Merge models into a single new model.
+ Parameters
+ ----------
+ models
+ The models to merge (two or more). A single model is returned as a copy.
+ match_by
+ How metabolites are unified across models: ``"name"`` (default) treats
+ metabolites with the same *name and compartment* as identical (IDs
+ ignored); ``"id"`` matches by metabolite ID.
+ track_origin
+ If True (default), record the source model's ``id`` in each reaction's,
+ metabolite's, and gene's ``notes['origin']``.
+
+ Returns
+ -------
+ cobra.Model
+ A new merged model (``id="MERGED"``). Reactions are **not** de-duplicated
+ — matching RAVEN, every reaction from every model is kept, with ID
+ collisions renamed ``id_``.
+ """
+ models = list(models)
+ if not models:
+ raise ValueError("merge_models requires at least one model.")
+ if match_by not in ("name", "id"):
+ raise ValueError(f"match_by must be 'name' or 'id', got {match_by!r}")
+ if len(models) == 1:
+ return models[0].copy()
+
+ merged = Model("MERGED")
+ comp_names: dict[str, str] = {}
+ met_lookup: dict = {} # name/comp or id key -> merged Metabolite
+
+ def met_key(met: Metabolite):
+ return (met.name, met.compartment) if match_by == "name" else met.id
+
+ def ensure_metabolite(src: Metabolite, origin: str) -> Metabolite:
+ key = met_key(src)
+ if key in met_lookup:
+ existing = met_lookup[key]
+ # Two source models can map to the same name[comp] (or id) with
+ # different formula/charge; silently picking the first-seen has
+ # quietly corrupted mass balance in the past. Warn so the caller
+ # sees the conflict.
+ if src.formula and existing.formula and src.formula != existing.formula:
+ warnings.warn(
+ f"merge_models: metabolite {existing.id!r} (from earlier model) "
+ f"and {src.id!r} (from {origin!r}) share key {key!r} but "
+ f"have different formulas ({existing.formula!r} vs {src.formula!r}); "
+ "keeping the first.",
+ stacklevel=3,
+ )
+ if (
+ existing.charge is not None
+ and src.charge is not None
+ and existing.charge != src.charge
+ ):
+ warnings.warn(
+ f"merge_models: metabolite {existing.id!r} (from earlier model) "
+ f"and {src.id!r} (from {origin!r}) share key {key!r} but "
+ f"have different charges ({existing.charge} vs {src.charge}); "
+ "keeping the first.",
+ stacklevel=3,
+ )
+ return existing
+ new_id = _unique_id(merged.metabolites, src.id, origin)
+ new_met = Metabolite(
+ new_id, name=src.name, compartment=src.compartment,
+ formula=src.formula, charge=src.charge,
+ )
+ new_met.annotation = copy.deepcopy(src.annotation)
+ new_met.notes = copy.deepcopy(src.notes)
+ if track_origin:
+ new_met.notes.setdefault("origin", origin)
+ merged.add_metabolites([new_met])
+ met_lookup[key] = new_met
+ return new_met
+
+ for model in models:
+ origin = model.id or "model"
+ comp_names.update(model.compartments)
+ genes_before = {g.id for g in merged.genes}
+
+ for rxn in model.reactions:
+ new_id = _unique_id(merged.reactions, rxn.id, origin)
+ new_rxn = Reaction(new_id, name=rxn.name)
+ new_rxn.bounds = rxn.bounds
+ new_rxn.subsystem = rxn.subsystem
+ merged.add_reactions([new_rxn])
+ new_rxn.add_metabolites(
+ {ensure_metabolite(m, origin): coef for m, coef in rxn.metabolites.items()}
+ )
+ if rxn.gene_reaction_rule:
+ new_rxn.gene_reaction_rule = rxn.gene_reaction_rule
+ new_rxn.annotation = copy.deepcopy(rxn.annotation)
+ new_rxn.notes = copy.deepcopy(rxn.notes)
+ if track_origin:
+ new_rxn.notes.setdefault("origin", origin)
+
+ if track_origin:
+ for gene in merged.genes:
+ if gene.id not in genes_before:
+ gene.notes.setdefault("origin", origin)
+
+ merged._compartments.update(comp_names)
+ return merged
diff --git a/src/raven_python/manipulation/parameters.py b/src/raven_python/manipulation/parameters.py
new file mode 100644
index 0000000..f349804
--- /dev/null
+++ b/src/raven_python/manipulation/parameters.py
@@ -0,0 +1,78 @@
+"""Set reaction bounds to a sign-aware ±% variance band around measured values.
+
+Cobra has no idiom for the *variance band* case (e.g. "5 ± 20 %"); the other common
+bound-setting cases are cobra one-liners:
+
+* fixed lb / ub → ``reaction.lower_bound`` / ``upper_bound`` / ``reaction.bounds``
+* equality → ``reaction.bounds = (v, v)``
+* objective → ``model.objective = {reaction: coeff}``
+* unconstrained → ``reaction.bounds = cobra.Configuration().bounds``
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Sequence
+
+import cobra
+from cobra import Reaction
+
+Number = int | float
+
+
+def _resolve(model: cobra.Model, reactions) -> list[Reaction]:
+ if isinstance(reactions, (str, Reaction)):
+ reactions = [reactions]
+ out: list[Reaction] = []
+ for r in reactions:
+ if isinstance(r, Reaction):
+ out.append(r)
+ elif r in model.reactions:
+ out.append(model.reactions.get_by_id(r))
+ else:
+ raise ValueError(f"Reaction {r!r} not found in the model.")
+ return out
+
+
+def _broadcast(value, n: int) -> list[float]:
+ if isinstance(value, (int, float)):
+ return [float(value)] * n
+ vals = [float(v) for v in value]
+ if len(vals) != n:
+ raise ValueError(
+ f"Expected 1 or {n} values to match the reactions, got {len(vals)}."
+ )
+ return vals
+
+
+def set_variance_bounds(
+ model: cobra.Model,
+ reactions: str | Reaction | Iterable,
+ values: Number | Sequence[Number],
+ percent: Number,
+) -> list[Reaction]:
+ """Constrain reactions to a ``±percent/2`` band around measured values.
+
+ For a measured value ``v`` and ``percent`` ``p``, the bounds become
+ ``v * (1 - p/200) .. v * (1 + p/200)`` — i.e. ``percent`` is the *total*
+ width, split half above and half below. For a negative ``v`` the two are
+ swapped so that ``lb <= ub``. E.g. ``percent=5`` gives 97.5 %..102.5 % of ``v``.
+
+ Parameters
+ ----------
+ reactions
+ Reaction IDs or objects.
+ values
+ Measured value per reaction; a scalar is broadcast to all reactions.
+ percent
+ Total band width as a percentage.
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions affected.
+ """
+ rxns = _resolve(model, reactions)
+ half = percent / 200.0
+ for rxn, v in zip(rxns, _broadcast(values, len(rxns)), strict=True):
+ lo, hi = v * (1 - half), v * (1 + half)
+ rxn.bounds = (hi, lo) if v < 0 else (lo, hi)
+ return rxns
diff --git a/src/raven_python/manipulation/remove.py b/src/raven_python/manipulation/remove.py
new file mode 100644
index 0000000..492de36
--- /dev/null
+++ b/src/raven_python/manipulation/remove.py
@@ -0,0 +1,120 @@
+"""Remove metabolites or genes from a model.
+
+For removing *reactions*, use cobra directly:
+``cobra.Model.remove_reactions(reactions, remove_orphans=...)``.
+
+The two functions here delegate the core to cobra and add the cobra-absent behaviour:
+
+* ``remove_metabolites`` — cobra matches metabolites by ID; RAVEN's ``isNames``
+ deletes a metabolite in **every compartment at once** by name. That name
+ resolution is the *sole* reason this wrapper exists (see the note on it).
+* ``remove_genes`` — cobra's ``cobra.manipulation.remove_genes`` already rewrites
+ GPRs through the boolean AST (removing one gene of ``A and B`` empties the
+ rule, of ``A or B`` keeps the other) — exactly RAVEN's intent, without its
+ ``eval``. The gap is RAVEN's default of **constraining** flux-blocked reactions
+ to zero instead of deleting them; exposed as ``blocked_reactions``.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+from cobra import Gene, Metabolite
+from cobra.manipulation import remove_genes as _cobra_remove_genes
+
+
+def _as_list(obj) -> list:
+ if isinstance(obj, (str, Metabolite, Gene)):
+ return [obj]
+ return list(obj)
+
+
+def remove_metabolites(
+ model: cobra.Model,
+ metabolites: str | Metabolite | Iterable,
+ *,
+ by_name: bool = False,
+ destructive: bool = False,
+) -> None:
+ """Remove metabolites, optionally matching by name across all compartments.
+
+ Parameters
+ ----------
+ by_name
+ If True, ``metabolites`` are metabolite *names*; every metabolite with a
+ matching name is removed, regardless of compartment (RAVEN ``isNames``).
+ If False, they are IDs/objects, resolved via cobra.
+ destructive
+ Passed to cobra: if True, also remove every reaction the metabolite
+ participates in.
+
+ Note
+ ----
+ With ``by_name=False`` this is just ``model.remove_metabolites`` — the
+ ``by_name`` cross-compartment deletion is the only thing this adds over cobra.
+ """
+ if by_name:
+ wanted = set(_as_list(metabolites))
+ targets = [m for m in model.metabolites if m.name in wanted]
+ else:
+ targets = model.metabolites.get_by_any(_as_list(metabolites))
+ if targets:
+ model.remove_metabolites(targets, destructive=destructive)
+
+
+def remove_genes(
+ model: cobra.Model,
+ genes: str | Gene | Iterable,
+ *,
+ blocked_reactions: str = "remove",
+ remove_orphans: bool = False,
+) -> list[str]:
+ """Remove genes and handle reactions left unable to carry flux.
+
+ GPR rewriting (with correct AND/OR semantics) and gene deletion are done by cobra;
+ this adds a policy for reactions whose GPR becomes empty (no enzyme left):
+
+ * ``"remove"`` — delete them (cobra's default).
+ * ``"constrain"`` — keep them but set bounds to ``(0, 0)``.
+ * ``"keep"`` — leave them with an empty GPR and unchanged bounds.
+
+ ``remove_orphans`` (only meaningful with ``blocked_reactions="remove"``)
+ passes through to cobra: drop metabolites *and* genes orphaned by the removal.
+
+ Returns
+ -------
+ list of str
+ IDs of the reactions that became flux-blocked (had a GPR, now empty).
+ """
+ if blocked_reactions not in ("remove", "constrain", "keep"):
+ raise ValueError(
+ f"blocked_reactions must be 'remove', 'constrain', or 'keep', "
+ f"got {blocked_reactions!r}"
+ )
+
+ # Resolve to gene IDs that are actually in the model (RAVEN filters likewise).
+ requested = [g.id if isinstance(g, Gene) else g for g in _as_list(genes)]
+ present = [gid for gid in requested if gid in model.genes]
+ if not present:
+ return []
+
+ # Reactions touched by these genes that currently have a GPR.
+ affected = set()
+ for gid in present:
+ affected.update(r.id for r in model.genes.get_by_id(gid).reactions)
+ had_gpr = {rid for rid in affected if model.reactions.get_by_id(rid).gene_reaction_rule}
+
+ # cobra rewrites GPRs (AST) and removes the gene objects; we manage reactions.
+ _cobra_remove_genes(model, present, remove_reactions=False)
+
+ blocked = [
+ rid for rid in had_gpr if not model.reactions.get_by_id(rid).gene_reaction_rule
+ ]
+
+ if blocked_reactions == "remove":
+ model.remove_reactions(blocked, remove_orphans=remove_orphans)
+ elif blocked_reactions == "constrain":
+ for rid in blocked:
+ model.reactions.get_by_id(rid).bounds = (0, 0)
+
+ return sorted(blocked)
diff --git a/src/raven_python/manipulation/simplify.py b/src/raven_python/manipulation/simplify.py
new file mode 100644
index 0000000..2deaccd
--- /dev/null
+++ b/src/raven_python/manipulation/simplify.py
@@ -0,0 +1,229 @@
+"""Reduce a model by removing/merging reactions that cannot carry flux.
+
+Four reduction modes that cobra does not cover out of the box:
+``remove_dead_end_reactions`` (reactions whose substrates have no producer),
+``remove_duplicate_reactions``, ``constrain_reversible_reactions`` (tighten bounds
+via FVA), and ``group_linear_reactions`` (lossy fold of unit-stoichiometry chains
+into one reaction; drops gene rules).
+
+Cobra-covered modes that you'd reach for separately:
+
+* No-flux removal → ``cobra.flux_analysis.find_blocked_reactions``.
+* Zero-interval removal → filter reactions with ``bounds == (0, 0)`` then prune.
+"""
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable
+
+import cobra
+from cobra.flux_analysis import flux_variability_analysis
+
+from raven_python.manipulation.irreversible import convert_to_irreversible
+
+
+def _prune_orphan_metabolites(model: cobra.Model) -> list[str]:
+ orphans = [m for m in model.metabolites if not m.reactions]
+ if orphans:
+ model.remove_metabolites(orphans)
+ return [m.id for m in orphans]
+
+
+def _can_produce_and_consume(met) -> tuple[bool, bool]:
+ """Whether the network can both produce and consume ``met`` (given directions)."""
+ produce = consume = False
+ for rxn in met.reactions:
+ coef = rxn.get_coefficient(met)
+ if coef > 0:
+ produce |= rxn.upper_bound > 0
+ consume |= rxn.lower_bound < 0
+ elif coef < 0:
+ consume |= rxn.upper_bound > 0
+ produce |= rxn.lower_bound < 0
+ return produce, consume
+
+
+def remove_dead_end_reactions(
+ model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> tuple[list[str], list[str]]:
+ """Iteratively remove dead-end reactions and metabolites.
+
+ A metabolite
+ is a dead end if it participates in only one reaction, or if (accounting for
+ reaction directionality) it can only be produced or only consumed — such
+ metabolites cannot carry steady-state flux, so the reactions touching them
+ are removed. Repeats until stable.
+
+ Returns ``(removed_reaction_ids, removed_metabolite_ids)``.
+ """
+ reserved = set(reserved or [])
+ removed_rxns: list[str] = []
+ removed_mets: list[str] = []
+ while True:
+ removed_mets += _prune_orphan_metabolites(model)
+ dead = [
+ m
+ for m in model.metabolites
+ if len(m.reactions) <= 1 or not all(_can_produce_and_consume(m))
+ ]
+ if not dead:
+ break
+ rxns = {r for m in dead for r in m.reactions}
+ to_delete = [r for r in rxns if r.id not in reserved]
+ if not to_delete:
+ break
+ removed_rxns += [r.id for r in to_delete]
+ model.remove_reactions(to_delete)
+ return removed_rxns, removed_mets
+
+
+def _signature(rxn):
+ mets = frozenset((m.id, c) for m, c in rxn.metabolites.items())
+ return (mets, rxn.lower_bound, rxn.upper_bound, rxn.objective_coefficient)
+
+
+def remove_duplicate_reactions(
+ model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> list[str]:
+ """Remove all-but-one of each set of duplicate reactions.
+
+ Reactions are duplicates when they have identical stoichiometry, bounds, and
+ objective coefficient. One of each set is kept (reserved reactions are never
+ removed). Returns the removed reaction IDs.
+ """
+ reserved = set(reserved or [])
+ groups: dict = {}
+ for rxn in model.reactions:
+ groups.setdefault(_signature(rxn), []).append(rxn)
+
+ removed: list[str] = []
+ for rxns in groups.values():
+ if len(rxns) <= 1:
+ continue
+ keep = rxns[-1]
+ to_remove = [r for r in rxns if r is not keep and r.id not in reserved]
+ if to_remove:
+ removed += [r.id for r in to_remove]
+ model.remove_reactions(to_remove)
+ return removed
+
+
+def constrain_reversible_reactions(
+ model: cobra.Model, *, eps: float = 1e-9
+) -> list[str]:
+ """Constrain reversible reactions that can only carry flux one way.
+
+ Runs FVA on
+ each reversible reaction; if it can only carry forward flux its lower bound
+ is set to 0, and if it can only carry reverse flux it is flipped to a forward
+ reaction (stoichiometry, bounds, and objective negated). Returns the changed
+ reaction IDs.
+ """
+ revs = [r for r in model.reactions if r.lower_bound < 0 < r.upper_bound]
+ if not revs:
+ return []
+ # Infeasible models surface as either OptimizationError (Gurobi/HiGHS) or
+ # NaN-filled ranges (some optlang backends silently). Catch both and raise
+ # a single clear error — the original ``abs(NaN) < eps`` comparison would
+ # have silently no-op'd, letting bogus "all reactions truly reversible"
+ # decisions sneak through.
+ try:
+ fva = flux_variability_analysis(
+ model, reaction_list=revs, fraction_of_optimum=0.0
+ )
+ except Exception as exc: # noqa: BLE001 - solver-family agnostic
+ raise RuntimeError(
+ "constrain_reversible_reactions: FVA failed — the model is likely "
+ "infeasible at fraction_of_optimum=0. Fix the infeasibility first "
+ "(often a missing exchange or an over-constrained essential). "
+ f"({exc})"
+ ) from exc
+ if fva[["minimum", "maximum"]].isna().any().any():
+ raise RuntimeError(
+ "constrain_reversible_reactions: FVA returned NaN ranges — the "
+ "model is infeasible at fraction_of_optimum=0. Fix the infeasibility "
+ "first (often a missing exchange or an over-constrained essential)."
+ )
+
+ changed: list[str] = []
+ for rxn in revs:
+ lo = fva.at[rxn.id, "minimum"]
+ hi = fva.at[rxn.id, "maximum"]
+ # Guard against ±inf ranges (unbounded objective): treat them as truly
+ # reversible rather than "zero" by the abs(·) < eps check.
+ if math.isinf(lo) or math.isinf(hi):
+ continue
+ min_zero, max_zero = abs(lo) < eps, abs(hi) < eps
+ if min_zero == max_zero: # both ~0 (blocked) or both nonzero (truly reversible)
+ continue
+ if max_zero: # only reverse flux → flip to a forward reaction
+ old_lb = rxn.lower_bound
+ rxn.add_metabolites({m: -2 * c for m, c in rxn.metabolites.items()})
+ rxn.bounds = (0.0, -old_lb)
+ rxn.objective_coefficient = -rxn.objective_coefficient
+ else: # only forward flux
+ rxn.lower_bound = 0.0
+ changed.append(rxn.id)
+ return changed
+
+
+def group_linear_reactions(
+ model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> None:
+ """Merge linear (single-producer, single-consumer) reaction chains.
+
+ **Lossy**: gene-reaction
+ associations are discarded (RAVEN does the same), since merged reactions have
+ no meaningful combined GPR. The model is first made irreversible, then any
+ metabolite that is produced by exactly one reaction and consumed by exactly
+ one reaction is eliminated by merging the two reactions. Mutates in place.
+ """
+ reserved = set(reserved or [])
+
+ # Lossy: drop all gene information.
+ for rxn in model.reactions:
+ rxn.gene_reaction_rule = ""
+ for gene in list(model.genes):
+ model.genes.remove(gene)
+
+ convert_to_irreversible(model)
+
+ # Worklist of metabolites to (re)consider for merging. Each metabolite
+ # participating in a merge can expose new linear chains in its neighbours,
+ # so we re-enqueue the touched mets rather than restart the whole scan
+ # (the old O(n²·m) restart-after-every-merge loop).
+ pending: list = list(model.metabolites)
+ seen_in_pass: set = set()
+ while pending:
+ met = pending.pop()
+ if met not in model.metabolites: # removed in a previous merge
+ continue
+ rxns = list(met.reactions)
+ if len(rxns) != 2 or any(r.id in reserved for r in rxns):
+ continue
+ r1, r2 = rxns
+ c1, c2 = r1.get_coefficient(met), r2.get_coefficient(met)
+ if (c1 > 0) == (c2 > 0): # need one producer and one consumer
+ continue
+ ratio = abs(c1 / c2)
+ new_lb = max(r1.lower_bound, r2.lower_bound / ratio)
+ new_ub = min(r1.upper_bound, r2.upper_bound / ratio)
+ new_obj = r1.objective_coefficient + r2.objective_coefficient * ratio
+ # Re-enqueue every metabolite touched by either side — the merge can
+ # turn neighbours into single-producer/consumer chains in turn.
+ touched = {m for m in r1.metabolites} | {m for m in r2.metabolites}
+ # Merge r2*ratio into r1; the shared metabolite cancels and is dropped.
+ r1.add_metabolites({m: c * ratio for m, c in r2.metabolites.items()})
+ model.remove_reactions([r2])
+ r1.bounds = (new_lb, new_ub)
+ r1.objective_coefficient = new_obj
+ seen_in_pass.clear()
+ for m in touched:
+ if m in model.metabolites and id(m) not in seen_in_pass:
+ seen_in_pass.add(id(m))
+ pending.append(m)
+ # One terminal cleanup pass (cheap; only what remains).
+ empty = [r for r in model.reactions if not r.metabolites]
+ if empty:
+ model.remove_reactions(empty)
+ _prune_orphan_metabolites(model)
diff --git a/src/raven_python/manipulation/transfer.py b/src/raven_python/manipulation/transfer.py
new file mode 100644
index 0000000..b867f02
--- /dev/null
+++ b/src/raven_python/manipulation/transfer.py
@@ -0,0 +1,144 @@
+"""Copy reactions (with their metabolites and genes) from another model.
+
+cobra's ``Model.merge`` / ``add_reactions`` match metabolites strictly by id. This
+transfers a chosen set of reactions from a *source* model into a draft, matching
+metabolites by **name[compartment]** instead — so a compound present in both models
+under different ids is reused rather than duplicated, and only genuinely new
+metabolites are created (copying the source's id, formula,
+charge, and annotation). New genes are auto-created by cobra when the GPR is set.
+This is the post-``getModelFromHomology`` "copy a few more reactions across"
+workflow.
+"""
+from __future__ import annotations
+
+import copy
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Reaction
+
+from raven_python.manipulation.add import _new_met_id
+
+
+def _name_comp(met: Metabolite) -> str:
+ return f"{met.name}[{met.compartment}]"
+
+
+def add_reactions_from_model(
+ model: cobra.Model,
+ source_model: cobra.Model,
+ reactions: str | Iterable[str],
+ *,
+ genes: bool | str | Iterable[str] = False,
+ note: str | None = "Added via add_reactions_from_model()",
+ confidence: int | None = None,
+) -> list[Reaction]:
+ """Copy reactions from ``source_model`` into ``model``.
+ Parameters
+ ----------
+ model
+ Draft model to copy into (mutated in place).
+ source_model
+ Model to copy reactions from.
+ reactions
+ Reaction ID(s) in ``source_model``. Reactions already present in
+ ``model`` (by ID) are skipped.
+ genes
+ ``False`` (default): add reactions without GPRs. ``True``: copy each
+ reaction's GPR from the source. A string: use it as the GPR for every
+ added reaction. A list: per-reaction GPRs (matching the reactions that
+ are actually added). New genes are created automatically.
+ note
+ Stored in each added reaction's ``notes['note']`` (set ``None`` to skip).
+ confidence
+ If given, stored in each added reaction's ``notes['confidence_score']``.
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions added, in input order.
+ """
+ rxn_ids = [reactions] if isinstance(reactions, str) else list(reactions)
+ missing = [r for r in rxn_ids if r not in source_model.reactions]
+ if missing:
+ raise ValueError(f"Reactions not found in the source model: {missing}")
+
+ new_ids = [r for r in rxn_ids if r not in model.reactions]
+ if not new_ids:
+ raise ValueError("All reactions are already in the model.")
+ source_rxns = [source_model.reactions.get_by_id(r) for r in new_ids]
+
+ if genes is False:
+ rules = [""] * len(source_rxns)
+ elif genes is True:
+ rules = [r.gene_reaction_rule for r in source_rxns]
+ elif isinstance(genes, str):
+ rules = [genes] * len(source_rxns)
+ else:
+ rules = list(genes)
+ if len(rules) != len(source_rxns):
+ raise ValueError(
+ f"genes list has {len(rules)} rules but {len(source_rxns)} "
+ "reactions are being added."
+ )
+
+ # Match metabolites by name[comp]; create only the genuinely new ones.
+ draft_by_name = {_name_comp(m): m for m in model.metabolites}
+ new_mets: list[Metabolite] = []
+ pending: set[str] = set()
+ # Track ids minted within this batch so two source mets that share an id
+ # but differ in name[comp] don't collide when add_metabolites runs.
+ pending_ids: set[str] = set()
+ for srx in source_rxns:
+ for met in srx.metabolites:
+ key = _name_comp(met)
+ if key in draft_by_name or key in pending:
+ continue
+ pending.add(key)
+ if met.id not in model.metabolites and met.id not in pending_ids:
+ new_id = met.id
+ else:
+ # _new_met_id only knows the model; loop past in-batch hits too.
+ new_id = _new_met_id(model, "m")
+ while new_id in pending_ids:
+ n = int(new_id[1:]) + 1
+ new_id = f"m{n}"
+ while new_id in model.metabolites:
+ n += 1
+ new_id = f"m{n}"
+ pending_ids.add(new_id)
+ new_met = Metabolite(
+ new_id,
+ name=met.name,
+ compartment=met.compartment,
+ formula=met.formula,
+ charge=met.charge,
+ )
+ new_met.annotation = copy.deepcopy(met.annotation)
+ new_met.notes = copy.deepcopy(met.notes)
+ new_mets.append(new_met)
+ draft_by_name[key] = new_met
+ if new_mets:
+ model.add_metabolites(new_mets)
+
+ added: list[Reaction] = []
+ for srx, rule in zip(source_rxns, rules, strict=True):
+ rxn = Reaction(srx.id, name=srx.name)
+ rxn.bounds = srx.bounds
+ rxn.subsystem = srx.subsystem
+ model.add_reactions([rxn])
+ rxn.add_metabolites(
+ {draft_by_name[_name_comp(met)]: coef for met, coef in srx.metabolites.items()}
+ )
+ if rule:
+ rxn.gene_reaction_rule = rule
+ rxn.annotation = copy.deepcopy(srx.annotation)
+ notes = copy.deepcopy(srx.notes)
+ if note is not None:
+ notes["note"] = note
+ if confidence is not None:
+ notes["confidence_score"] = confidence
+ rxn.notes = notes
+ added.append(rxn)
+
+ return added
diff --git a/src/raven_python/manipulation/transport.py b/src/raven_python/manipulation/transport.py
new file mode 100644
index 0000000..d0c1bf1
--- /dev/null
+++ b/src/raven_python/manipulation/transport.py
@@ -0,0 +1,157 @@
+"""Add transport reactions between compartments.
+
+cobra has no transport-reaction primitive. For each metabolite this matches the
+species by *name* across compartments (the source in ``from_compartment`` and its
+same-named twin in each target compartment), optionally creating the target
+metabolite, and
+builds a ``-1 from / +1 to`` reaction with a sequential ``tr_0001`` ID.
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Reaction
+
+from raven_python.manipulation.add import _new_met_id
+
+
+def _index_by_name(mets: Iterable[Metabolite], compartment: str) -> dict[str, Metabolite]:
+ """Index metabolites by name, warning when a name is duplicated.
+
+ Same-name duplicates in a single compartment are unusual but legal in cobra,
+ and the previous one-pass dict comprehension silently dropped all but one.
+ """
+ out: dict[str, list[Metabolite]] = {}
+ for m in mets:
+ out.setdefault(m.name, []).append(m)
+ chosen: dict[str, Metabolite] = {}
+ for name, group in out.items():
+ if len(group) > 1:
+ warnings.warn(
+ f"Multiple metabolites named {name!r} in compartment {compartment!r} "
+ f"({[m.id for m in group]}); using {group[0].id!r} for transport.",
+ stacklevel=3,
+ )
+ chosen[name] = group[0]
+ return chosen
+
+
+def _transport_id_factory(model: cobra.Model, prefix: str):
+ pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+ used = [int(m.group(1)) for r in model.reactions if (m := pattern.match(r.id))]
+ counter = max(used) + 1 if used else 1
+
+ def next_id() -> str:
+ nonlocal counter
+ while f"{prefix}{counter:04d}" in model.reactions:
+ counter += 1
+ rid = f"{prefix}{counter:04d}"
+ counter += 1
+ return rid
+
+ return next_id
+
+
+def add_transport_reactions(
+ model: cobra.Model,
+ from_compartment: str,
+ to_compartments: str | Iterable[str],
+ metabolite_names: str | Iterable[str] | None = None,
+ *,
+ reversible: bool = True,
+ only_to_existing: bool = True,
+ id_prefix: str = "tr_",
+) -> list[Reaction]:
+ """Add transport reactions from one compartment to one or more others.
+ Parameters
+ ----------
+ from_compartment
+ Source compartment id.
+ to_compartments
+ Target compartment id(s).
+ metabolite_names
+ Names of metabolites to transport. Default: every metabolite in
+ ``from_compartment``.
+ reversible
+ If True (default), bounds span the cobra configuration default
+ (reversible); otherwise lower bound 0.
+ only_to_existing
+ If True (default), only transport a metabolite into a target
+ compartment where a same-named metabolite already exists. If False,
+ create the missing target metabolite (copying name/formula/charge/
+ annotation from the source) before adding the transport.
+ id_prefix
+ Prefix for the sequential reaction IDs (``tr_0001``, ...).
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The transport reactions added, in creation order.
+ """
+ # cobra's `model.compartments` only lists compartments that have metabolites;
+ # include registered-but-empty ones so transport can target an empty compartment.
+ known = set(model.compartments) | set(model._compartments)
+ if from_compartment not in known:
+ raise ValueError(f"Compartment {from_compartment!r} is not in the model.")
+ if isinstance(to_compartments, str):
+ to_compartments = [to_compartments]
+ else:
+ to_compartments = list(to_compartments)
+ for comp in to_compartments:
+ if comp not in known:
+ raise ValueError(f"Compartment {comp!r} is not in the model.")
+
+ source = _index_by_name(
+ (m for m in model.metabolites if m.compartment == from_compartment),
+ from_compartment,
+ )
+ if metabolite_names is None:
+ names = list(source)
+ else:
+ names = [metabolite_names] if isinstance(metabolite_names, str) else list(metabolite_names)
+ missing = [n for n in names if n not in source]
+ if missing:
+ raise ValueError(
+ f"Metabolites not found in compartment {from_compartment!r}: {missing}"
+ )
+
+ cfg = cobra.Configuration()
+ bounds = (cfg.lower_bound, cfg.upper_bound) if reversible else (0.0, cfg.upper_bound)
+ from_name = model.compartments.get(from_compartment) or from_compartment
+ next_id = _transport_id_factory(model, id_prefix)
+
+ added: list[Reaction] = []
+ for to_comp in to_compartments:
+ to_name = model.compartments.get(to_comp) or to_comp
+ targets = _index_by_name(
+ (m for m in model.metabolites if m.compartment == to_comp),
+ to_comp,
+ )
+ for name in names:
+ src = source[name]
+ dst = targets.get(name)
+ if dst is None:
+ if only_to_existing:
+ continue
+ dst = Metabolite(
+ _new_met_id(model, "m"),
+ name=name,
+ compartment=to_comp,
+ formula=src.formula,
+ charge=src.charge,
+ )
+ dst.annotation = dict(src.annotation)
+ model.add_metabolites([dst])
+ targets[name] = dst
+
+ rxn = Reaction(next_id())
+ rxn.name = f"{name} transport, {from_name}-{to_name}"
+ rxn.bounds = bounds
+ model.add_reactions([rxn])
+ rxn.add_metabolites({src: -1, dst: 1})
+ added.append(rxn)
+
+ return added
diff --git a/src/raven_python/utils/__init__.py b/src/raven_python/utils/__init__.py
new file mode 100644
index 0000000..7127bdd
--- /dev/null
+++ b/src/raven_python/utils/__init__.py
@@ -0,0 +1,16 @@
+"""Shared helpers — GPR linting, elemental balance, model curation checks, id sorting."""
+from raven_python.utils.balance import ElementalBalance, get_elemental_balance
+from raven_python.utils.gpr import GPRIssue, find_non_dnf_grrules, is_dnf
+from raven_python.utils.sort import sort_identifiers
+from raven_python.utils.validate import ModelIssue, check_model
+
+__all__ = [
+ "ElementalBalance",
+ "GPRIssue",
+ "ModelIssue",
+ "check_model",
+ "find_non_dnf_grrules",
+ "get_elemental_balance",
+ "is_dnf",
+ "sort_identifiers",
+]
diff --git a/src/raven_python/utils/balance.py b/src/raven_python/utils/balance.py
new file mode 100644
index 0000000..ee64ab4
--- /dev/null
+++ b/src/raven_python/utils/balance.py
@@ -0,0 +1,89 @@
+"""Check the elemental balance of reactions, distinguishing *unbalanced* from
+*unknown* (missing formula).
+
+cobra's ``reaction.check_mass_balance()`` silently treats a missing formula as
+empty, so a reaction can look "unbalanced" — or even balanced — when the truth is
+that the data is incomplete. This module checks for missing formulas first and
+returns a graded status
+per reaction (``balanced`` / ``unbalanced`` / ``unknown``) plus the element
+imbalance — over a batch, as structured data.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ElementalBalance:
+ """Balance result for one reaction.
+
+ Attributes
+ ----------
+ reaction_id
+ ID of the reaction.
+ status
+ ``"balanced"`` — elements balance;
+ ``"unbalanced"`` — they do not (see ``imbalance``);
+ ``"unknown"`` — at least one metabolite has no formula, so it cannot be
+ determined (cobra would silently miscount these).
+ imbalance
+ Element → net coefficient (products − reactants), only for
+ ``"unbalanced"``; empty otherwise. Charge is not included.
+ """
+
+ reaction_id: str
+ status: str
+ imbalance: dict[str, float] = field(default_factory=dict)
+
+
+def get_elemental_balance(
+ model: cobra.Model, reactions=None
+) -> list[ElementalBalance]:
+ """Check whether reactions are elementally balanced.
+ Parameters
+ ----------
+ reactions
+ Reaction IDs/objects to check; default all reactions. (Boundary
+ reactions exchange mass with the environment and will read as
+ ``unbalanced`` — filter them out if that is not wanted.)
+
+ Returns
+ -------
+ list of ElementalBalance
+ One entry per checked reaction, in model order.
+ """
+ if reactions is None:
+ rxns = list(model.reactions)
+ else:
+ if isinstance(reactions, (str, cobra.Reaction)):
+ reactions = [reactions]
+ rxns = [
+ r if isinstance(r, cobra.Reaction) else model.reactions.get_by_id(r)
+ for r in reactions
+ ]
+
+ results: list[ElementalBalance] = []
+ for rxn in rxns:
+ if not rxn.metabolites:
+ # A reaction with no metabolites used to fall through to ``balanced``
+ # (vacuously) because ``any()`` over the empty list is False and the
+ # zero-element imbalance dict is empty. Treat the no-formula case
+ # (zero formulae present) as ``unknown``: we can't determine balance
+ # for a reaction without stoichiometry.
+ results.append(ElementalBalance(rxn.id, "unknown"))
+ continue
+ if any(not met.formula for met in rxn.metabolites):
+ results.append(ElementalBalance(rxn.id, "unknown"))
+ continue
+ imbalance = {
+ element: amount
+ for element, amount in rxn.check_mass_balance().items()
+ if element != "charge"
+ }
+ if imbalance:
+ results.append(ElementalBalance(rxn.id, "unbalanced", imbalance))
+ else:
+ results.append(ElementalBalance(rxn.id, "balanced"))
+ return results
diff --git a/src/raven_python/utils/gpr.py b/src/raven_python/utils/gpr.py
new file mode 100644
index 0000000..2e2122d
--- /dev/null
+++ b/src/raven_python/utils/gpr.py
@@ -0,0 +1,119 @@
+"""GPR (gene-protein-reaction rule) linting.
+
+Flag GPRs that are *not* in disjunctive normal form ("OR of AND-complexes"), via cobra's
+GPR AST. GPR syntax *normalisation* is already done by cobra on assignment, so it isn't
+re-implemented here.
+
+Part (2) has no cobrapy equivalent and is ported here, reworked onto cobra's
+GPR AST instead of RAVEN's brittle substring search. The relevant property is
+**disjunctive normal form (DNF)**: an OR of AND-clauses of single genes, e.g.
+``(G1 and G2) or G3``. Rules where an AND contains an OR — e.g.
+``(G1 or G2) and (G3 or G4)`` — are *valid* for cobra but ambiguous for the
+isoenzyme/complex reasoning used across RAVEN/GECKO, and ``expand_model``
+(see :mod:`raven_python.manipulation.expand`) only does something for DNF rules.
+:func:`find_non_dnf_grrules` surfaces them as structured data rather than, as
+RAVEN did, only printing a warning.
+"""
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _contains_or(node: ast.AST | None) -> bool:
+ """True if ``node``'s subtree contains an OR operator anywhere."""
+ if isinstance(node, ast.BoolOp):
+ if isinstance(node.op, ast.Or):
+ return True
+ return any(_contains_or(value) for value in node.values)
+ return False
+
+
+def _is_dnf_node(node: ast.AST | None) -> bool:
+ """True if the AST rooted at ``node`` is in disjunctive normal form.
+
+ DNF here means no AND operator has an OR anywhere beneath it, i.e. the
+ rule is a single gene, a pure AND-complex, or an OR of those.
+ """
+ if node is None or isinstance(node, ast.Name):
+ return True
+ if isinstance(node, ast.BoolOp):
+ if isinstance(node.op, ast.And):
+ return not any(_contains_or(value) for value in node.values)
+ # OR: every disjunct must itself be DNF
+ return all(_is_dnf_node(value) for value in node.values)
+ # Unknown node type: don't flag it as a problem.
+ return True
+
+
+def is_dnf(gpr: GPR | str | None) -> bool:
+ """Return whether a GPR is in disjunctive normal form (OR of AND-complexes).
+
+ Parameters
+ ----------
+ gpr
+ A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``.
+ An empty/``None`` rule is trivially DNF.
+
+ Examples
+ --------
+ >>> is_dnf("(G1 and G2) or G3")
+ True
+ >>> is_dnf("(G1 or G2) and G3")
+ False
+ """
+ if isinstance(gpr, str):
+ gpr = GPR.from_string(gpr)
+ if gpr is None:
+ return True
+ return _is_dnf_node(gpr.body)
+
+
+@dataclass(frozen=True)
+class GPRIssue:
+ """A reaction whose GPR is flagged by the linter.
+
+ Attributes
+ ----------
+ reaction_id
+ ID of the reaction.
+ gpr
+ The (already cobra-normalised) grRule string.
+ reason
+ Human-readable explanation of why it was flagged.
+ """
+
+ reaction_id: str
+ gpr: str
+ reason: str
+
+
+_NON_DNF_REASON = (
+ "GPR is not in disjunctive normal form (an AND clause contains an OR). "
+ "Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, "
+ 'e.g. rewrite "(G1 or G2) and (G3 or G4)" as '
+ '"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".'
+)
+
+
+def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]:
+ """Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes").
+
+ Uses cobra's GPR AST. Reactions with no GPR are skipped.
+
+ Returns
+ -------
+ list of GPRIssue
+ One entry per flagged reaction, in model reaction order. Empty if all
+ GPRs are simple OR-of-AND-complexes.
+ """
+ issues: list[GPRIssue] = []
+ for rxn in model.reactions:
+ if not rxn.gene_reaction_rule:
+ continue
+ if not is_dnf(rxn.gpr):
+ issues.append(GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON))
+ return issues
diff --git a/src/raven_python/utils/parse.py b/src/raven_python/utils/parse.py
new file mode 100644
index 0000000..8068f6c
--- /dev/null
+++ b/src/raven_python/utils/parse.py
@@ -0,0 +1,33 @@
+"""Small parsing helpers shared across raven_python."""
+from __future__ import annotations
+
+import re
+
+# A metabolite written as ``name[comp]``. The name is greedy so that, for a
+# pathological name that itself contains brackets, the *last* ``[...]`` is taken
+# as the compartment (matching RAVEN getIndexes' ``max(strfind('['))`` rule).
+_NAME_COMP_RE = re.compile(r"^(?P.+)\[(?P[^\[\]]+)\]$")
+
+
+def parse_name_comp(token: str) -> tuple[str, str | None]:
+ """Split a ``name[comp]`` token into ``(name, compartment)``.
+
+ This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``'
+ ``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written
+ as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``.
+
+ Returns ``(name, None)`` when there is no trailing ``[...]``.
+
+ Examples
+ --------
+ >>> parse_name_comp("ATP[c]")
+ ('ATP', 'c')
+ >>> parse_name_comp("ATP")
+ ('ATP', None)
+ >>> parse_name_comp("weird[name][m]")
+ ('weird[name]', 'm')
+ """
+ match = _NAME_COMP_RE.match(token.strip())
+ if match:
+ return match.group("name").strip(), match.group("comp").strip()
+ return token.strip(), None
diff --git a/src/raven_python/utils/sort.py b/src/raven_python/utils/sort.py
new file mode 100644
index 0000000..a8641a8
--- /dev/null
+++ b/src/raven_python/utils/sort.py
@@ -0,0 +1,21 @@
+"""Sort a model's identifiers alphabetically — useful for deterministic,
+diff-friendly output.
+
+cobra's ``DictList.sort`` reorders one list (and rebuilds its lookup index), but
+there is no single "sort the whole model" call; this provides it.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def sort_identifiers(model: cobra.Model) -> cobra.Model:
+ """Sort reactions, metabolites and genes alphabetically by ID, in place.
+
+ Returns the same (mutated) model for convenience. Compartments are a plain
+ dict and are emitted sorted by writers as needed.
+ """
+ model.reactions.sort(key=lambda r: r.id)
+ model.metabolites.sort(key=lambda m: m.id)
+ model.genes.sort(key=lambda g: g.id)
+ return model
diff --git a/src/raven_python/utils/validate.py b/src/raven_python/utils/validate.py
new file mode 100644
index 0000000..c08df48
--- /dev/null
+++ b/src/raven_python/utils/validate.py
@@ -0,0 +1,86 @@
+"""Curation checks for a model.
+
+A QC bundle cobra has no single call for: orphaned objects, empty reactions,
+duplicated metabolite ``name + compartment``, empty names, and objective sanity.
+:func:`check_model` returns these as structured :class:`ModelIssue` records.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ModelIssue:
+ """One curation issue found in a model.
+
+ Attributes
+ ----------
+ category
+ Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``,
+ ``"orphan_gene"``, ``"duplicate_name_compartment"``,
+ ``"empty_metabolite_name"``, ``"objective"``.
+ object_id
+ ID of the offending object, or ``None`` for model-level issues.
+ message
+ Human-readable description.
+ """
+
+ category: str
+ object_id: str | None
+ message: str
+
+
+def check_model(model: cobra.Model) -> list[ModelIssue]:
+ """Run curation checks on a model and return the issues found.
+
+ Does not
+ raise; returns a (possibly empty) list of :class:`ModelIssue`.
+ """
+ issues: list[ModelIssue] = []
+
+ for met in model.metabolites:
+ if not met.reactions:
+ issues.append(
+ ModelIssue("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.")
+ )
+ if not (met.name and str(met.name).strip()):
+ issues.append(
+ ModelIssue("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.")
+ )
+
+ for gene in model.genes:
+ if not gene.reactions:
+ issues.append(
+ ModelIssue("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.")
+ )
+
+ for rxn in model.reactions:
+ if not rxn.metabolites:
+ issues.append(
+ ModelIssue("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.")
+ )
+
+ by_name_comp: dict[tuple[str, str], list[str]] = {}
+ for met in model.metabolites:
+ by_name_comp.setdefault((met.name, met.compartment), []).append(met.id)
+ for (name, comp), ids in by_name_comp.items():
+ if name and len(ids) > 1:
+ issues.append(
+ ModelIssue(
+ "duplicate_name_compartment",
+ None,
+ f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}",
+ )
+ )
+
+ objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0]
+ if not objective_rxns:
+ issues.append(ModelIssue("objective", None, "No reaction has a nonzero objective coefficient."))
+ elif len(objective_rxns) > 1:
+ issues.append(
+ ModelIssue("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}")
+ )
+
+ return issues
diff --git a/tests/test_binaries.py b/tests/test_binaries.py
new file mode 100644
index 0000000..d74ce0b
--- /dev/null
+++ b/tests/test_binaries.py
@@ -0,0 +1,80 @@
+"""Tests for raven_python.binaries (binary resolution + bundled-ZIP provisioning)."""
+import hashlib
+import shutil
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from raven_python import binaries
+
+
+def test_resolve_explicit_path():
+ assert binaries.resolve_binary("blastp", binary="/opt/x/blastp") == "/opt/x/blastp"
+
+
+def test_resolve_env_var(monkeypatch):
+ monkeypatch.setenv("RAVEN_PYTHON_DIAMOND", "/custom/diamond")
+ assert binaries.resolve_binary("diamond") == "/custom/diamond"
+
+
+@pytest.mark.skipif(not shutil.which("blastp"), reason="blastp not installed")
+def test_resolve_via_path():
+ assert binaries.resolve_binary("blastp") == shutil.which("blastp")
+
+
+def test_resolve_unresolvable_raises(monkeypatch):
+ monkeypatch.setattr(shutil, "which", lambda _: None)
+ with pytest.raises(FileNotFoundError, match="Could not find"):
+ binaries.resolve_binary("diamond") # empty registry, not on PATH
+
+
+def test_platform_key_format():
+ key = binaries.platform_key()
+ assert "-" in key
+ os_part, arch = key.split("-", 1)
+ assert os_part in {"linux", "macos", "windows"} or os_part # tolerant
+
+
+def test_ensure_binary_downloads_verifies_extracts(tmp_path, monkeypatch):
+ # Build a fake bundle ZIP containing an executable, served via file:// URL.
+ exe = tmp_path / "footool"
+ exe.write_text("#!/bin/sh\necho hi\n")
+ archive = tmp_path / "footool.zip"
+ with zipfile.ZipFile(archive, "w") as zf:
+ zf.write(exe, "footool")
+ sha = hashlib.sha256(archive.read_bytes()).hexdigest()
+
+ registry = {
+ "footool": {
+ "version": "1.0",
+ "provides": ["footool"],
+ "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": sha}},
+ }
+ }
+ monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache"))
+
+ path = binaries.ensure_binary("footool", registry=registry)
+ assert Path(path).exists()
+ assert Path(path).name == "footool"
+ # cached on second call (same path, no re-download needed)
+ assert binaries.ensure_binary("footool", registry=registry) == path
+
+
+def test_ensure_binary_sha_mismatch(tmp_path, monkeypatch):
+ archive = tmp_path / "x.zip"
+ with zipfile.ZipFile(archive, "w") as zf:
+ zf.writestr("footool", "data")
+ registry = {
+ "footool": {"version": "1", "provides": ["footool"],
+ "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": "deadbeef"}}}
+ }
+ monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache"))
+ with pytest.raises(ValueError, match="SHA256 mismatch"):
+ binaries.ensure_binary("footool", registry=registry)
+
+
+def test_ensure_binary_unhosted_platform_raises(tmp_path):
+ registry = {"footool": {"version": "1", "provides": ["footool"], "platforms": {}}}
+ with pytest.raises(FileNotFoundError, match="No bundled"):
+ binaries.ensure_binary("footool", registry=registry)
diff --git a/tests/test_change_grrules.py b/tests/test_change_grrules.py
new file mode 100644
index 0000000..d33f723
--- /dev/null
+++ b/tests/test_change_grrules.py
@@ -0,0 +1,49 @@
+"""Tests for change_gene_reaction_rules (changeGrRules port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, change_gene_reaction_rules
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"},
+ {"id": "R2", "equation": "a_c --> b_c"},
+ ],
+ )
+ return m
+
+
+def test_replace_rule_and_create_genes(model):
+ (rxn,) = change_gene_reaction_rules(model, {"R1": "G2 and G3"})
+ assert rxn.gene_reaction_rule == "G2 and G3"
+ assert {g.id for g in rxn.genes} == {"G2", "G3"}
+ assert {"G2", "G3"} <= {g.id for g in model.genes}
+
+
+def test_append_rule(model):
+ change_gene_reaction_rules(model, {"R1": "G4"}, replace=False)
+ # (G1) or (G4), normalised by cobra
+ assert model.reactions.get_by_id("R1").gene_reaction_rule == "G1 or G4"
+
+
+def test_append_when_empty_is_just_new(model):
+ change_gene_reaction_rules(model, {"R2": "G5"}, replace=False)
+ assert model.reactions.get_by_id("R2").gene_reaction_rule == "G5"
+
+
+def test_batch(model):
+ changed = change_gene_reaction_rules(model, {"R1": "GA", "R2": "GB"})
+ assert [r.id for r in changed] == ["R1", "R2"]
+
+
+def test_unknown_reaction_errors(model):
+ with pytest.raises(ValueError, match="not found"):
+ change_gene_reaction_rules(model, {"NOPE": "G1"})
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..714c3a9
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,89 @@
+"""Tests for ensure_data (data.py). Uses file:// URLs to avoid the network."""
+import hashlib
+
+import pytest
+
+from raven_python.data import (
+ CORE_KEGG_FILES,
+ ensure_data_file,
+ ensure_kegg_data,
+)
+
+
+def _sha256(data: bytes) -> str:
+ return hashlib.sha256(data).hexdigest()
+
+
+@pytest.fixture
+def served(tmp_path, monkeypatch):
+ """A fake registry served from local files, with the cache pointed at tmp."""
+ src = tmp_path / "src"
+ src.mkdir()
+ payloads = {
+ "reference_model.yml.gz": b"!!omap model bytes",
+ "ko_reaction.tsv.gz": b"ko\treaction\n",
+ "ko_names.tsv.gz": b"ko\tname\n",
+ "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n",
+ "rxn_flags.tsv.gz": b"reaction\tspontaneous\n",
+ }
+ files = {}
+ for name, data in payloads.items():
+ path = src / name
+ path.write_bytes(data)
+ files[name] = {"url": path.as_uri(), "sha256": _sha256(data)}
+ registry = {"kegg": {"version": "v1", "files": files}}
+
+ cache = tmp_path / "cache"
+ monkeypatch.setenv("XDG_CACHE_HOME", str(cache))
+ return registry, cache, payloads
+
+
+def test_ensure_data_file_downloads_and_caches(served):
+ registry, cache, payloads = served
+ path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry)
+ assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz"
+ assert path.read_bytes() == payloads["ko_reaction.tsv.gz"]
+
+
+def test_ensure_data_file_reuses_cache(served, monkeypatch):
+ registry, _, _ = served
+ first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+ # Break the URL: a second call must hit the cache, not re-download.
+ registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent"
+ second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+ assert first == second and second.exists()
+
+
+def test_sha256_mismatch_rejected(served):
+ registry, cache, _ = served
+ registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64
+ with pytest.raises(ValueError, match="SHA256 mismatch"):
+ ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry)
+ # The corrupt partial download must not be left behind.
+ assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists()
+
+
+def test_unknown_dataset_actionable_error(served):
+ registry, _, _ = served
+ with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
+ ensure_data_file("metacyc", "x", registry=registry)
+
+
+def test_unknown_file_lists_available(served):
+ registry, _, _ = served
+ with pytest.raises(FileNotFoundError, match="not registered"):
+ ensure_data_file("kegg", "missing.tsv.gz", registry=registry)
+
+
+def test_ensure_kegg_data_fetches_core_set(served):
+ registry, cache, _ = served
+ out = ensure_kegg_data(registry=registry)
+ assert out == cache / "raven_python" / "data" / "kegg-v1"
+ for name in CORE_KEGG_FILES:
+ assert (out / name).is_file()
+
+
+def test_empty_registry_raises():
+ # The shipped registry is empty until artefacts are published.
+ with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
+ ensure_data_file("kegg", "ko_reaction.tsv.gz")
diff --git a/tests/test_io_excel.py b/tests/test_io_excel.py
new file mode 100644
index 0000000..12434ef
--- /dev/null
+++ b/tests/test_io_excel.py
@@ -0,0 +1,111 @@
+"""Tests for raven_python.io.excel (exportToExcelFormat port, export only)."""
+import cobra
+import pytest
+
+openpyxl = pytest.importorskip("openpyxl")
+
+from raven_python.io import export_to_excel
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("yeastGEM")
+ m.name = "Yeast"
+ m.compartments = {"c": "cytoplasm"}
+ m.notes["metaData"] = {"taxonomy": "taxonomy/559292", "defaultLB": "-1000"}
+ m.add_metabolites(
+ [
+ cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+ ]
+ )
+ m.metabolites.atp_c.annotation = {"kegg.compound": ["C00002"], "smiles": ["C1=NC"]}
+ m.metabolites.atp_c.notes = {"inchis": "InChI=1S/X"}
+ add_reactions_from_equations(
+ m,
+ [{"id": "R1", "equation": "atp_c <=> adp_c", "name": "rxn one",
+ "gene_reaction_rule": "G1", "subsystem": "glycolysis"}],
+ )
+ r = m.reactions.R1
+ r.annotation = {"ec-code": ["1.1.1.1"], "kegg.reaction": ["R00001"]}
+ r.notes = {"confidence_score": 2, "note": "a note", "references": "PMID:1"}
+ r.objective_coefficient = 1
+ return m
+
+
+def _wb(path):
+ return openpyxl.load_workbook(path)
+
+
+def test_sheets_present(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ wb = _wb(out)
+ assert set(wb.sheetnames) == {"RXNS", "METS", "COMPS", "GENES", "MODEL"}
+
+
+def test_rxns_sheet(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ ws = _wb(out)["RXNS"]
+ header = [c.value for c in ws[1]]
+ row = {header[i]: c.value for i, c in enumerate(ws[2])}
+ assert row["ID"] == "R1"
+ assert row["NAME"] == "rxn one"
+ assert "ATP[c]" in row["EQUATION"] and "<=>" in row["EQUATION"]
+ assert row["EC-NUMBER"] == "1.1.1.1"
+ assert row["GENE ASSOCIATION"] == "G1"
+ assert row["SUBSYSTEM"] == "glycolysis"
+ assert row["OBJECTIVE"] == 1
+ assert row["CONFIDENCE SCORE"] == 2
+ assert row["NOTE"] == "a note"
+ assert row["MIRIAM"] == "kegg.reaction/R00001" # ec-code excluded (own column)
+
+
+def test_mets_sheet(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ ws = _wb(out)["METS"]
+ header = [c.value for c in ws[1]]
+ rows = {
+ r[header.index("REPLACEMENT ID")].value: {header[i]: c.value for i, c in enumerate(r)}
+ for r in ws.iter_rows(min_row=2)
+ }
+ atp = rows["atp_c"]
+ assert atp["ID"] == "ATP[c]"
+ assert atp["NAME"] == "ATP"
+ assert atp["InChI"] == "InChI=1S/X"
+ assert atp["COMPOSITION"] is None # suppressed when InChI present
+ assert atp["CHARGE"] == -4
+ assert atp["MIRIAM"] == "kegg.compound/C00002" # smiles excluded
+
+
+def test_model_sheet(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ ws = _wb(out)["MODEL"]
+ header = [c.value for c in ws[1]]
+ row = {header[i]: c.value for i, c in enumerate(ws[2])}
+ assert row["ID"] == "yeastGEM"
+ assert row["NAME"] == "Yeast"
+ assert row["TAXONOMY"] == "taxonomy/559292"
+ assert row["DEFAULT LOWER"] == "-1000"
+
+
+def test_genes_sheet(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ ws = _wb(out)["GENES"]
+ header = [c.value for c in ws[1]]
+ row = {header[i]: c.value for i, c in enumerate(ws[2])}
+ assert row["NAME"] == "G1"
+
+
+def test_no_genes_skips_sheet(tmp_path):
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite("a_c", compartment="c")])
+ add_reactions_from_equations(m, [{"id": "R1", "equation": "a_c -->"}])
+ out = tmp_path / "m.xlsx"
+ export_to_excel(m, out)
+ assert "GENES" not in _wb(out).sheetnames
diff --git a/tests/test_io_git.py b/tests/test_io_git.py
new file mode 100644
index 0000000..28881dc
--- /dev/null
+++ b/tests/test_io_git.py
@@ -0,0 +1,69 @@
+"""Tests for raven_python.io.git (exportForGit port)."""
+import cobra
+import pytest
+
+from raven_python.io import export_for_git
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("yeastGEM")
+ m.compartments = {"c": "cytoplasm"}
+ m.add_metabolites(
+ [cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP", compartment="c")]
+ )
+ add_reactions_from_equations(m, [{"id": "R1", "equation": "atp_c <=> adp_c"}])
+ return m
+
+
+def test_standard_gem_layout(model, tmp_path):
+ root = export_for_git(model, tmp_path, prefix="yeast", formats=("yml", "xml", "mat", "xlsx", "txt"))
+ assert root == tmp_path / "model"
+ assert (root / "yml" / "yeast.yml").exists()
+ assert (root / "xml" / "yeast.xml").exists()
+ assert (root / "mat" / "yeast.mat").exists()
+ assert (root / "xlsx" / "yeast.xlsx").exists()
+ assert (root / "txt" / "yeast.txt").exists()
+ assert (root / "dependencies.txt").exists()
+
+
+def test_dependencies_file(model, tmp_path):
+ root = export_for_git(model, tmp_path, formats=("yml",))
+ deps = (root / "dependencies.txt").read_text()
+ assert "python\t" in deps
+ assert "cobra\t" in deps
+ assert "raven_python\t" in deps
+
+
+def test_flat_layout(model, tmp_path):
+ root = export_for_git(model, tmp_path, formats=("yml",), sub_dirs=False)
+ assert root == tmp_path
+ assert (tmp_path / "model.yml").exists()
+
+
+def test_subset_of_formats(model, tmp_path):
+ root = export_for_git(model, tmp_path, formats=("yml", "xml"))
+ assert (root / "yml" / "model.yml").exists()
+ assert not (root / "mat").exists()
+ assert not (root / "xlsx").exists()
+
+
+def test_does_not_mutate_model(model, tmp_path):
+ order_before = [r.id for r in model.reactions]
+ export_for_git(model, tmp_path, formats=("yml",))
+ assert [r.id for r in model.reactions] == order_before
+
+
+def test_txt_table_content(model, tmp_path):
+ root = export_for_git(model, tmp_path, formats=("txt",))
+ txt = (root / "txt" / "model.txt").read_text()
+ assert txt.splitlines()[0].startswith("Rxn name\t")
+ assert "R1" in txt
+ assert "ATP[c]" in txt
+
+
+def test_bad_format(model, tmp_path):
+ with pytest.raises(ValueError, match="Unknown format"):
+ export_for_git(model, tmp_path, formats=("yml", "json"))
diff --git a/tests/test_io_sif.py b/tests/test_io_sif.py
new file mode 100644
index 0000000..d50ad98
--- /dev/null
+++ b/tests/test_io_sif.py
@@ -0,0 +1,82 @@
+"""Tests for raven_python.io.sif (exportModelToSIF port)."""
+import cobra
+import pytest
+
+from raven_python.io import export_model_to_sif
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b"},
+ {"id": "R2", "equation": "b --> c"},
+ ],
+ )
+ return m
+
+
+def _lines(path):
+ return [ln.split("\t") for ln in path.read_text().splitlines()]
+
+
+def test_reaction_compound(model, tmp_path):
+ out = tmp_path / "g.sif"
+ export_model_to_sif(model, out, "rc")
+ rows = {r[0]: (r[1], set(r[2:])) for r in _lines(out)}
+ assert rows["R1"] == ("rc", {"a", "b"})
+ assert rows["R2"] == ("rc", {"b", "c"})
+
+
+def test_reaction_reaction(model, tmp_path):
+ out = tmp_path / "g.sif"
+ export_model_to_sif(model, out, "rr")
+ rows = {r[0]: set(r[2:]) for r in _lines(out)}
+ # R1 and R2 share metabolite b
+ assert rows["R1"] == {"R2"}
+ assert rows["R2"] == {"R1"}
+
+
+def test_compound_compound(model, tmp_path):
+ out = tmp_path / "g.sif"
+ export_model_to_sif(model, out, "cc")
+ rows = {r[0]: set(r[2:]) for r in _lines(out)}
+ # a is a substrate of R1 (a->b): a links to product b
+ assert "b" in rows.get("a", set())
+ # b is substrate of R2 (b->c): b links to c
+ assert "c" in rows.get("b", set())
+
+
+def test_custom_labels(model, tmp_path):
+ out = tmp_path / "g.sif"
+ export_model_to_sif(model, out, "rc", reaction_labels={"R1": "Reaction1"})
+ sources = {r[0] for r in _lines(out)}
+ assert "Reaction1" in sources
+ assert "R1" not in sources
+
+
+def test_bad_graph_type(model, tmp_path):
+ with pytest.raises(ValueError, match="graph_type"):
+ export_model_to_sif(model, tmp_path / "g.sif", "xx")
+
+
+def test_cc_does_not_mutate_input(model, tmp_path):
+ n_before = len(model.reactions)
+ export_model_to_sif(model, tmp_path / "g.sif", "cc")
+ assert len(model.reactions) == n_before # convert_to_irreversible ran on a copy
+
+
+# --- regression: label-map collision (known_issues.md B4) ------------------
+
+def test_collapsing_label_map_warns(model, tmp_path):
+ """A label map that sends two distinct ids to the same label silently merges
+ nodes during the target-side dedup. Now warns so the user sees it."""
+ with pytest.warns(UserWarning, match="multiple ids to the same label"):
+ export_model_to_sif(
+ model, tmp_path / "g.sif", "rc",
+ reaction_labels={"R1": "shared", "R2": "shared"},
+ )
diff --git a/tests/test_io_yaml.py b/tests/test_io_yaml.py
new file mode 100644
index 0000000..510af5f
--- /dev/null
+++ b/tests/test_io_yaml.py
@@ -0,0 +1,202 @@
+"""Tests for raven_python.io.yaml against the RAVEN fa281a1 (cobra-native !!omap) schema."""
+from pathlib import Path
+
+import cobra
+import pytest
+from cobra.io.yaml import yaml as cobra_yaml
+
+from raven_python.io import read_yaml_model, write_yaml_model
+
+# A model laid out exactly as RAVEN writeYAMLmodel (fa281a1) emits: cobra-native
+# structure, RAVEN-only fields as top-level per-entry keys, smiles/ec-code inside
+# the annotation block, metaData provenance-only, id/name/version top-level.
+RAVEN_DOC = {
+ "metabolites": [
+ {
+ "id": "s_0001",
+ "name": "ATP",
+ "compartment": "c",
+ "formula": "C10H16N5O13P3",
+ "charge": -4,
+ "inchis": "InChI=1S/CH4",
+ "deltaG": 12.5,
+ "notes": "a metabolite note",
+ "metFrom": "KEGG",
+ "annotation": {"kegg.compound": ["C00002"], "smiles": ["C1=NC2"]},
+ },
+ {"id": "s_0002", "name": "ADP", "compartment": "c"},
+ ],
+ "reactions": [
+ {
+ "id": "R1",
+ "name": "rxn one",
+ "metabolites": {"s_0001": -1, "s_0002": 1},
+ "lower_bound": -1000.0,
+ "upper_bound": 1000.0,
+ "gene_reaction_rule": "G1",
+ "subsystem": "glycolysis",
+ "confidence_score": 2,
+ "references": "PMID:123",
+ "rxnFrom": "manual",
+ "notes": "a reaction note",
+ "deltaG": -5.0,
+ "annotation": {"ec-code": ["1.1.1.1"]},
+ }
+ ],
+ "genes": [
+ {"id": "G1", "name": "gene one", "protein": "P12345", "annotation": {"uniprot": ["P12345"]}}
+ ],
+ "id": "testModel",
+ "name": "Test Model",
+ "compartments": {"c": "cytoplasm"},
+ "version": "1.0",
+ "metaData": {"date": "2026-05-23", "taxonomy": "taxonomy/559292", "defaultLB": "-1000"},
+ "ec-rxns": [{"id": "R1", "kcat": 100.0}],
+}
+
+
+@pytest.fixture
+def yaml_file(tmp_path) -> Path:
+ p = tmp_path / "model.yml"
+ with open(p, "w", encoding="utf-8") as fh:
+ cobra_yaml.dump(RAVEN_DOC, fh)
+ return p
+
+
+def test_standard_content(yaml_file):
+ model = read_yaml_model(yaml_file)
+ assert model.id == "testModel"
+ assert model.name == "Test Model"
+ assert {m.id for m in model.metabolites} == {"s_0001", "s_0002"}
+ r = model.reactions.get_by_id("R1")
+ assert r.bounds == (-1000.0, 1000.0)
+ assert r.subsystem == "glycolysis"
+ assert r.gene_reaction_rule == "G1"
+
+
+def test_annotation_owned_by_cobra(yaml_file):
+ # smiles / ec-code / miriam live in the annotation block (cobra reads them)
+ model = read_yaml_model(yaml_file)
+ assert model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"]
+ assert model.metabolites.get_by_id("s_0001").annotation["kegg.compound"] == ["C00002"]
+ assert model.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"]
+ assert model.genes.get_by_id("G1").annotation["uniprot"] == ["P12345"]
+
+
+def test_raven_only_fields_captured(yaml_file):
+ model = read_yaml_model(yaml_file)
+ a = model.metabolites.get_by_id("s_0001")
+ assert a.notes["inchis"] == "InChI=1S/CH4"
+ assert a.notes["deltaG"] == 12.5
+ assert a.notes["note"] == "a metabolite note" # RAVEN metNotes string, no crash
+ assert a.notes["metFrom"] == "KEGG"
+ assert "smiles" not in a.notes # smiles stays in annotation
+ r = model.reactions.get_by_id("R1")
+ assert r.notes["confidence_score"] == 2
+ assert r.notes["references"] == "PMID:123"
+ assert r.notes["rxnFrom"] == "manual"
+ assert r.notes["note"] == "a reaction note"
+ assert r.notes["deltaG"] == -5.0
+ assert model.genes.get_by_id("G1").notes["protein"] == "P12345"
+
+
+def test_model_level_extras(yaml_file):
+ model = read_yaml_model(yaml_file)
+ assert model.notes["metaData"]["taxonomy"] == "taxonomy/559292"
+ assert model.notes["version"] == "1.0"
+ assert model.notes["_yaml_sections"]["ec-rxns"][0]["kcat"] == 100.0
+
+
+def test_round_trip(yaml_file, tmp_path):
+ model = read_yaml_model(yaml_file)
+ out = tmp_path / "out.yml"
+ write_yaml_model(model, out)
+ reloaded = read_yaml_model(out)
+
+ assert reloaded.id == "testModel"
+ assert reloaded.notes["version"] == "1.0"
+ assert reloaded.notes["metaData"]["taxonomy"] == "taxonomy/559292"
+ a = reloaded.metabolites.get_by_id("s_0001")
+ assert a.notes["deltaG"] == 12.5
+ assert a.notes["note"] == "a metabolite note"
+ assert a.annotation["smiles"] == ["C1=NC2"]
+ r = reloaded.reactions.get_by_id("R1")
+ assert r.notes["confidence_score"] == 2
+ assert reloaded.genes.get_by_id("G1").notes["protein"] == "P12345"
+ assert reloaded.notes["_yaml_sections"]["ec-rxns"][0]["id"] == "R1"
+
+
+def test_extra_notes_not_dropped_when_free_text_note_present(yaml_file, tmp_path):
+ """An entry with both a RAVEN free-text note and an extra note keeps both on write."""
+ model = read_yaml_model(yaml_file)
+ a = model.metabolites.get_by_id("s_0001")
+ a.notes["note"] = "free text"
+ a.notes["custom"] = "extra value" # a non-RAVEN note that must not be silently lost
+ out = tmp_path / "out.yml"
+ write_yaml_model(model, out)
+ text = out.read_text()
+ assert "extra value" in text # the leftover note survives serialization
+
+
+def test_gzipped_round_trip(yaml_file, tmp_path):
+ # A .yml.gz path is transparently gzipped on write and read.
+ model = read_yaml_model(yaml_file)
+ out = tmp_path / "out.yml.gz"
+ write_yaml_model(model, out)
+ assert out.read_bytes()[:2] == b"\x1f\x8b" # gzip magic
+ reloaded = read_yaml_model(out)
+ assert reloaded.id == "testModel"
+ assert {m.id for m in reloaded.metabolites} == {"s_0001", "s_0002"}
+
+
+def test_output_is_cobra_readable(yaml_file, tmp_path):
+ # The written file must load with stock cobra (it's cobra's native format).
+ model = read_yaml_model(yaml_file)
+ out = tmp_path / "out.yml"
+ write_yaml_model(model, out)
+ cobra_model = cobra.io.load_yaml_model(str(out))
+ assert cobra_model.id == "testModel"
+ assert {m.id for m in cobra_model.metabolites} == {"s_0001", "s_0002"}
+ # RAVEN-only fields land in cobra notes; smiles in annotation
+ assert cobra_model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"]
+
+
+def test_write_emits_raven_top_level_keys(yaml_file, tmp_path):
+ model = read_yaml_model(yaml_file)
+ out = tmp_path / "out.yml"
+ write_yaml_model(model, out)
+ text = out.read_text()
+ # RAVEN-only fields are lifted back to top-level entry keys, not buried in notes
+ assert "inchis:" in text
+ assert "deltaG:" in text
+ assert "confidence_score:" in text
+ assert "metaData:" in text
+
+
+def test_legacy_id_in_metadata(tmp_path):
+ # Older RAVEN files nest id/name under metaData and have no top-level id.
+ legacy = {
+ "metabolites": [{"id": "a_c", "name": "A", "compartment": "c"}],
+ "reactions": [],
+ "genes": [],
+ "compartments": {"c": "cyt"},
+ "metaData": {"id": "legacyModel", "name": "Legacy"},
+ }
+ p = tmp_path / "legacy.yml"
+ with open(p, "w", encoding="utf-8") as fh:
+ cobra_yaml.dump(legacy, fh)
+ model = read_yaml_model(p)
+ assert model.id == "legacyModel"
+ assert model.name == "Legacy"
+
+
+# Optional smoke test against a real model file if present.
+_YEAST = Path("/home/eduardk/github/GECKO/tutorials/full_ecModel/models/yeast-GEM.yml")
+
+
+@pytest.mark.skipif(not _YEAST.exists(), reason="real yeast-GEM.yml not available")
+def test_real_yeast_gem_loads():
+ model = read_yaml_model(_YEAST)
+ assert len(model.reactions) > 1000
+ # legacy file: identity comes from metaData
+ assert model.id
diff --git a/tests/test_manipulation_add.py b/tests/test_manipulation_add.py
new file mode 100644
index 0000000..2a3a9d3
--- /dev/null
+++ b/tests/test_manipulation_add.py
@@ -0,0 +1,278 @@
+"""Tests for raven_python.manipulation.add (addRxns port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils.parse import parse_name_comp
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+ cobra.Metabolite("h2o_c", name="H2O", compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+ cobra.Metabolite("pi_c", name="phosphate", compartment="c"),
+ ]
+ )
+ return m
+
+
+# --- parse_name_comp -------------------------------------------------------
+
+@pytest.mark.parametrize(
+ "token,expected",
+ [
+ ("ATP[c]", ("ATP", "c")),
+ ("ATP", ("ATP", None)),
+ (" ATP[c] ", ("ATP", "c")),
+ ("weird[name][m]", ("weird[name]", "m")),
+ ],
+)
+def test_parse_name_comp(token, expected):
+ assert parse_name_comp(token) == expected
+
+
+# --- id mode (eqnType 1) ---------------------------------------------------
+
+def test_add_by_id_basic_and_reversibility(model):
+ (rxn,) = add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "atp_c + h2o_c <=> adp_c + pi_c"}]
+ )
+ assert rxn.id == "R1"
+ assert rxn.reversibility is True
+ assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+ "atp_c": -1.0,
+ "h2o_c": -1.0,
+ "adp_c": 1.0,
+ "pi_c": 1.0,
+ }
+
+
+def test_irreversible_arrows(model):
+ rxns = add_reactions_from_equations(
+ model,
+ [
+ {"id": "R1", "equation": "atp_c --> adp_c"},
+ {"id": "R2", "equation": "atp_c => adp_c"},
+ ],
+ )
+ for r in rxns:
+ assert r.lower_bound == 0.0
+ assert r.reversibility is False
+
+
+def test_coefficients(model):
+ (rxn,) = add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "2 atp_c + 1.5 h2o_c --> adp_c"}]
+ )
+ assert rxn.get_coefficient("atp_c") == -2.0
+ assert rxn.get_coefficient("h2o_c") == -1.5
+
+
+def test_id_mode_creates_new_met_in_compartment(model):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> amp_c"}],
+ compartment="c",
+ )
+ assert "amp_c" in model.metabolites
+ assert model.metabolites.get_by_id("amp_c").compartment == "c"
+
+
+def test_id_mode_new_met_without_compartment_errors(model):
+ with pytest.raises(ValueError, match="no compartment"):
+ add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> amp_c"}])
+
+
+# --- name mode (eqnType 2) -------------------------------------------------
+
+def test_name_mode_matches_existing_by_name(model):
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP + H2O <=> ADP + phosphate"}],
+ mets_by="name",
+ compartment="c",
+ )
+ # resolved to the existing _c metabolites, not new ones
+ assert {m.id for m in rxn.metabolites} == {"atp_c", "h2o_c", "adp_c", "pi_c"}
+ assert len(model.metabolites) == 4
+
+
+def test_name_mode_creates_new_met_with_auto_id(model):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP --> AMP"}],
+ mets_by="name",
+ compartment="c",
+ )
+ new = [m for m in model.metabolites if m.name == "AMP"]
+ assert len(new) == 1
+ assert new[0].id == "m1"
+ assert new[0].compartment == "c"
+
+
+def test_name_mode_requires_compartment(model):
+ with pytest.raises(ValueError, match="needs a compartment"):
+ add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "ATP --> ADP"}], mets_by="name"
+ )
+
+
+# --- name[comp] mode (eqnType 3) -------------------------------------------
+
+def test_name_comp_syntax(model):
+ model.add_metabolites([cobra.Metabolite("atp_m", name="ATP", compartment="m")])
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP[c] --> ATP[m]"}],
+ mets_by="name",
+ compartment="c",
+ )
+ # matched ATP in two different compartments by name[comp]
+ assert {m.id for m in rxn.metabolites} == {"atp_c", "atp_m"}
+
+
+# --- genes -----------------------------------------------------------------
+
+def test_gene_rule_auto_creates_genes(model):
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}],
+ )
+ assert {g.id for g in rxn.genes} == {"G1", "G2"}
+ assert {g.id for g in model.genes} == {"G1", "G2"}
+
+
+def test_strict_genes_errors_on_unknown(model):
+ with pytest.raises(ValueError, match="genes not in the model"):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}],
+ allow_new_genes=False,
+ )
+
+
+def test_strict_genes_ok_when_present(model):
+ model.genes.append(cobra.core.gene.Gene("G1"))
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}],
+ allow_new_genes=False,
+ )
+ assert rxn.gene_reaction_rule == "G1"
+
+
+# --- guards & extras -------------------------------------------------------
+
+def test_duplicate_reaction_id_errors(model):
+ model.add_reactions([cobra.Reaction("R1")])
+ with pytest.raises(ValueError, match="already exists"):
+ add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> adp_c"}])
+
+
+def test_strict_mets_errors(model):
+ with pytest.raises(ValueError, match="allow_new_mets"):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> amp_c"}],
+ compartment="c",
+ allow_new_mets=False,
+ )
+
+
+def test_explicit_bounds_override_arrow(model):
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c <=> adp_c", "bounds": (0, 50), "name": "myrxn"}],
+ )
+ assert rxn.bounds == (0, 50)
+ assert rxn.name == "myrxn"
+
+
+def test_net_zero_metabolite_dropped(model):
+ # atp_c on both sides nets to zero and is removed.
+ (rxn,) = add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "atp_c + h2o_c --> atp_c + adp_c"}]
+ )
+ assert "atp_c" not in {m.id for m in rxn.metabolites}
+ assert {m.id for m in rxn.metabolites} == {"h2o_c", "adp_c"}
+
+
+def test_missing_equation_errors(model):
+ with pytest.raises(ValueError, match="missing required 'equation'"):
+ add_reactions_from_equations(model, [{"id": "R1"}])
+
+
+def test_no_arrow_errors(model):
+ with pytest.raises(ValueError, match="No reaction arrow"):
+ add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c + h2o_c"}])
+
+
+# --- regression: leading-number metabolite name (known_issues.md A1) -------
+
+def test_name_mode_preserves_leading_number_name(model):
+ """A metabolite name that begins with a number isn't misparsed as a coefficient.
+
+ Before the fix the token ``"2 oxoglutarate"`` was parsed as ``(coeff=2, name="oxoglutarate")``
+ silently — corrupting the stoichiometry. The resolver now prefers the full
+ token when it matches an existing metabolite name.
+ """
+ model.add_metabolites([
+ cobra.Metabolite("akg_c", name="2 oxoglutarate", compartment="c"),
+ ])
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP + 2 oxoglutarate --> ADP"}],
+ mets_by="name",
+ compartment="c",
+ )
+ assert rxn.get_coefficient("akg_c") == -1.0 # not -2.0
+ assert rxn.get_coefficient("atp_c") == -1.0
+
+
+def test_name_mode_coefficient_still_works_without_collision(model):
+ """If the full token doesn't match anything, fall back to coefficient split."""
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "2 ATP + H2O --> ADP + phosphate"}],
+ mets_by="name",
+ compartment="c",
+ )
+ assert rxn.get_coefficient("atp_c") == -2.0
+
+
+# --- regression: empty-stoichiometry warning (known_issues.md A2) ----------
+
+def test_empty_stoichiometry_warns(model):
+ """All-terms-cancel reaction warns instead of silently shipping an empty rxn."""
+ with pytest.warns(UserWarning, match="no net metabolites"):
+ (rxn,) = add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "atp_c --> atp_c"}]
+ )
+ assert len(rxn.metabolites) == 0
+
+
+# --- regression: unknown-compartment warning (known_issues.md B2) ----------
+
+def test_id_mode_unknown_compartment_warns(model):
+ """A typo'd compartment used to silently produce a one-met ghost compartment
+ in id mode (the name/[comp] path used to validate, id mode never did)."""
+ with pytest.warns(UserWarning, match="unregistered compartment 'cyto'"):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> amp_c"}],
+ compartment="cyto", # typo for 'c'
+ )
+
+
+def test_name_comp_unknown_compartment_warns(model):
+ """Same defensive check in the name[comp] path when allow_new_mets=True."""
+ with pytest.warns(UserWarning, match="unregistered compartment 'mito'"):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP[c] --> AMP[mito]"}],
+ mets_by="name",
+ )
diff --git a/tests/test_manipulation_change.py b/tests/test_manipulation_change.py
new file mode 100644
index 0000000..8d54f58
--- /dev/null
+++ b/tests/test_manipulation_change.py
@@ -0,0 +1,93 @@
+"""Tests for raven_python.manipulation.change (changeRxns port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, change_reaction_equations
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("a_c", name="A", compartment="c"),
+ cobra.Metabolite("b_c", name="B", compartment="c"),
+ cobra.Metabolite("c_c", name="C", compartment="c"),
+ ]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {
+ "id": "R1",
+ "equation": "a_c <=> b_c",
+ "name": "first",
+ "bounds": (-30, 70),
+ "gene_reaction_rule": "G1 or G2",
+ "subsystem": "sub",
+ },
+ {"id": "R2", "equation": "a_c --> c_c"},
+ ],
+ )
+ return m
+
+
+def test_changes_stoichiometry(model):
+ (rxn,) = change_reaction_equations(model, {"R1": "a_c --> 2 c_c"})
+ assert rxn.id == "R1"
+ assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+ "a_c": -1.0,
+ "c_c": 2.0,
+ }
+
+
+def test_preserves_other_fields(model):
+ before = model.reactions.get_by_id("R1")
+ name, bounds, subsystem = before.name, before.bounds, before.subsystem
+ genes = {g.id for g in before.genes}
+
+ change_reaction_equations(model, {"R1": "a_c --> c_c"})
+
+ after = model.reactions.get_by_id("R1")
+ assert after.name == name
+ assert after.bounds == bounds # bounds untouched, per RAVEN
+ assert after.subsystem == subsystem
+ assert {g.id for g in after.genes} == genes
+
+
+def test_preserves_reaction_order(model):
+ order_before = [r.id for r in model.reactions]
+ change_reaction_equations(model, {"R1": "b_c --> c_c"})
+ assert [r.id for r in model.reactions] == order_before
+
+
+def test_bounds_not_changed_by_arrow(model):
+ # R1 starts reversible (-30, 70); a --> arrow must NOT make it irreversible.
+ change_reaction_equations(model, {"R1": "a_c --> b_c"})
+ assert model.reactions.get_by_id("R1").bounds == (-30, 70)
+
+
+def test_name_mode(model):
+ (rxn,) = change_reaction_equations(
+ model, {"R2": "A --> C"}, mets_by="name", compartment="c"
+ )
+ assert {m.id for m in rxn.metabolites} == {"a_c", "c_c"}
+
+
+def test_can_introduce_new_met(model):
+ change_reaction_equations(
+ model, {"R2": "a_c --> d_c"}, compartment="c"
+ )
+ assert "d_c" in model.metabolites
+ assert model.reactions.get_by_id("R2").get_coefficient("d_c") == 1.0
+
+
+def test_unknown_reaction_errors(model):
+ with pytest.raises(ValueError, match="not found"):
+ change_reaction_equations(model, {"NOPE": "a_c --> b_c"})
+
+
+def test_multiple_reactions(model):
+ changed = change_reaction_equations(model, {"R1": "a_c --> c_c", "R2": "b_c --> c_c"})
+ assert [r.id for r in changed] == ["R1", "R2"]
+ assert model.reactions.get_by_id("R2").get_coefficient("b_c") == -1.0
diff --git a/tests/test_manipulation_compartments.py b/tests/test_manipulation_compartments.py
new file mode 100644
index 0000000..4d3fb3b
--- /dev/null
+++ b/tests/test_manipulation_compartments.py
@@ -0,0 +1,139 @@
+"""Tests for manipulation/compartments.py — merge_compartments + copy_to_compartment."""
+from __future__ import annotations
+
+import cobra
+import pytest
+
+from raven_python.manipulation.compartments import copy_to_compartment, merge_compartments
+
+
+def _two_compartment_model() -> cobra.Model:
+ """A_c → B_c, A_m → B_m, and a transport A_c ↔ A_m. Multi-compartment toy."""
+ m = cobra.Model("toy")
+ A_c = cobra.Metabolite("A_c", name="A", compartment="c")
+ A_m = cobra.Metabolite("A_m", name="A", compartment="m")
+ B_c = cobra.Metabolite("B_c", name="B", compartment="c")
+ B_m = cobra.Metabolite("B_m", name="B", compartment="m")
+ m.add_metabolites([A_c, A_m, B_c, B_m])
+
+ def rxn(rid, lb, ub, mets, gpr=None):
+ r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub)
+ r.add_metabolites(mets)
+ if gpr:
+ r.gene_reaction_rule = gpr
+ return r
+ m.add_reactions([rxn("r_c", 0, 1000, {A_c: -1, B_c: 1}, "g1"),
+ rxn("r_m", 0, 1000, {A_m: -1, B_m: 1}, "g2"),
+ rxn("tr_A", -1000, 1000, {A_c: -1, A_m: 1})])
+ return m
+
+
+# ----------------------------------------------------------------- merge_compartments
+
+def test_merge_compartments_collapses_to_one():
+ """A_c + A_m → A; B_c + B_m → B; transport A_c↔A_m self-cancels and is dropped."""
+ m = _two_compartment_model()
+ merged, deleted, dupes = merge_compartments(m)
+ # Only the base ids survive.
+ assert {x.id for x in merged.metabolites} == {"A", "B"}
+ # The transport reaction collapsed (A → A) and was deleted.
+ assert "tr_A" in deleted
+ # r_c and r_m are now both A → B; one of them gets deduplicated.
+ surviving = {r.id for r in merged.reactions}
+ assert len(surviving & {"r_c", "r_m"}) == 1
+ assert (set(dupes) | (surviving & {"r_c", "r_m"})) == {"r_c", "r_m"}
+
+
+def test_merge_compartments_preserves_gpr_and_subsystem():
+ m = _two_compartment_model()
+ m.reactions.r_c.subsystem = "carbo"
+ merged, _, _ = merge_compartments(m)
+ survivor = next(r for r in merged.reactions if r.id in {"r_c", "r_m"})
+ # The survivor keeps its gene rule + subsystem (cobra may sometimes lose them
+ # through copy; we set them explicitly).
+ assert survivor.gene_reaction_rule in {"g1", "g2"}
+ if survivor.id == "r_c":
+ assert survivor.subsystem == "carbo"
+
+
+def test_merge_compartments_keeps_single_met_reactions_when_asked():
+ """drop_single_metabolite_reactions=False keeps the collapsed transport (now A → A,
+ which is empty stoichiometry after net-cancellation — still dropped, but the *one-met*
+ case is the more interesting one). Use a uniport pattern to exercise it."""
+ m = cobra.Model("uniport")
+ A_c = cobra.Metabolite("A_c", name="A", compartment="c")
+ A_m = cobra.Metabolite("A_m", name="A", compartment="m")
+ H_c = cobra.Metabolite("H_c", name="H", compartment="c")
+ m.add_metabolites([A_c, A_m, H_c])
+ # H+ symport: A_c + H_c → A_m. After merge: A + H → A → leaves H.
+ sym = cobra.Reaction("sym", lower_bound=0, upper_bound=1000)
+ sym.add_metabolites({A_c: -1, H_c: -1, A_m: 1})
+ m.add_reactions([sym])
+ merged_drop, deleted_drop, _ = merge_compartments(m, drop_single_metabolite_reactions=True)
+ assert "sym" in deleted_drop
+ merged_keep, deleted_keep, _ = merge_compartments(m, drop_single_metabolite_reactions=False)
+ # With keep, sym survives as a one-met reaction (consumes H).
+ assert "sym" not in deleted_keep
+ assert "sym" in {r.id for r in merged_keep.reactions}
+
+
+def test_merge_compartments_deduplicate_off_keeps_both():
+ m = _two_compartment_model()
+ merged, _, dupes = merge_compartments(m, deduplicate_reactions=False)
+ assert dupes == []
+ assert {"r_c", "r_m"} <= {r.id for r in merged.reactions}
+
+
+# ----------------------------------------------------------------- copy_to_compartment
+
+def test_copy_to_compartment_basic():
+ """Copy r_c into 'p' (peroxisome): a new reaction r_c_p with metabolites in p."""
+ m = _two_compartment_model()
+ out, new_rxns, new_mets = copy_to_compartment(m, ["r_c"], "p",
+ target_compartment_name="peroxisome")
+ assert "r_c_p" in [r.id for r in out.reactions]
+ new_r = out.reactions.r_c_p
+ assert {x.compartment for x in new_r.metabolites} == {"p"}
+ assert "A_p" in [x.id for x in out.metabolites]
+ assert "B_p" in [x.id for x in out.metabolites]
+ assert new_rxns == ["r_c_p"]
+ assert set(new_mets) == {"A_p", "B_p"}
+ # Original still there.
+ assert "r_c" in [r.id for r in out.reactions]
+
+
+def test_copy_to_compartment_preserves_gpr_and_bounds():
+ m = _two_compartment_model()
+ out, _, _ = copy_to_compartment(m, ["r_c"], "p")
+ new_r = out.reactions.r_c_p
+ assert new_r.gene_reaction_rule == "g1"
+ assert new_r.lower_bound == 0 and new_r.upper_bound == 1000
+
+
+def test_copy_to_compartment_delete_original_is_a_move():
+ m = _two_compartment_model()
+ out, _, _ = copy_to_compartment(m, ["r_c"], "p", delete_original=True)
+ assert "r_c" not in [r.id for r in out.reactions]
+ assert "r_c_p" in [r.id for r in out.reactions]
+
+
+def test_copy_to_compartment_idempotent():
+ """Calling twice doesn't add the reaction twice."""
+ m = _two_compartment_model()
+ out, _, _ = copy_to_compartment(m, ["r_c"], "p")
+ out2, new_rxns, _ = copy_to_compartment(out, ["r_c"], "p")
+ assert new_rxns == [] # nothing added on second call
+ assert len([r for r in out2.reactions if r.id == "r_c_p"]) == 1
+
+
+def test_copy_to_compartment_unknown_reaction_raises():
+ m = _two_compartment_model()
+ with pytest.raises(ValueError, match="not in model"):
+ copy_to_compartment(m, ["does_not_exist"], "p")
+
+
+def test_copy_to_compartment_custom_suffix():
+ m = _two_compartment_model()
+ out, new_rxns, _ = copy_to_compartment(m, ["r_c"], "p", id_suffix="copy1")
+ assert new_rxns == ["r_c_copy1"]
+ assert "A_copy1" in [x.id for x in out.metabolites]
diff --git a/tests/test_manipulation_expand.py b/tests/test_manipulation_expand.py
new file mode 100644
index 0000000..08cd2f2
--- /dev/null
+++ b/tests/test_manipulation_expand.py
@@ -0,0 +1,288 @@
+"""Tests for expand_model (RAVEN expandModel.m) — splitting isozymes into reactions.
+
+Adopted from geckopy's tests/test_expand.py.
+"""
+import cobra
+
+from raven_python.manipulation import expand_model
+from raven_python.manipulation.expand import _gpr_to_dnf
+
+# --------------------------------------------------------------------------- #
+# DNF conversion (internal helper, worth testing directly)
+# --------------------------------------------------------------------------- #
+
+def _dnf_from_gpr_string(gpr_str: str) -> list[list[str]]:
+ from cobra.core.gene import GPR
+
+ gpr = GPR.from_string(gpr_str)
+ return _gpr_to_dnf(gpr)
+
+
+def test_dnf_empty_gpr():
+ assert _dnf_from_gpr_string("") == []
+
+
+def test_dnf_single_gene():
+ assert _dnf_from_gpr_string("g1") == [["g1"]]
+
+
+def test_dnf_simple_and():
+ assert _dnf_from_gpr_string("g1 and g2") == [["g1", "g2"]]
+
+
+def test_dnf_simple_or():
+ assert _dnf_from_gpr_string("g1 or g2") == [["g1"], ["g2"]]
+
+
+def test_dnf_or_of_ands():
+ assert _dnf_from_gpr_string("(g1 and g2) or (g3 and g4)") == [
+ ["g1", "g2"],
+ ["g3", "g4"],
+ ]
+
+
+def test_dnf_distributes_and_over_or():
+ result = _dnf_from_gpr_string("g1 and (g2 or g3)")
+ assert result == [["g1", "g2"], ["g1", "g3"]]
+
+
+def test_dnf_triple_or():
+ assert _dnf_from_gpr_string("g1 or g2 or g3") == [
+ ["g1"], ["g2"], ["g3"],
+ ]
+
+
+def test_dnf_preserves_gene_order_within_clause():
+ result = _dnf_from_gpr_string("g3 and g1 and g2")
+ assert result == [["g3", "g1", "g2"]]
+
+
+# --------------------------------------------------------------------------- #
+# expand_model
+# --------------------------------------------------------------------------- #
+
+def _build_model(
+ reactions: list[tuple[str, dict[str, float], float, float, str]],
+) -> cobra.Model:
+ """Build from (rxn_id, {met_id: coef}, lb, ub, gpr) tuples."""
+ model = cobra.Model("test")
+ mets: dict[str, cobra.Metabolite] = {}
+ for _, stoich, _, _, _ in reactions:
+ for met_id in stoich:
+ if met_id not in mets:
+ mets[met_id] = cobra.Metabolite(met_id, compartment="c")
+
+ for rxn_id, stoich, lb, ub, gpr in reactions:
+ rxn = cobra.Reaction(rxn_id)
+ rxn.lower_bound = lb
+ rxn.upper_bound = ub
+ rxn.add_metabolites({mets[m]: c for m, c in stoich.items()})
+ if gpr:
+ rxn.gene_reaction_rule = gpr
+ model.add_reactions([rxn])
+ return model
+
+
+def test_does_not_expand_reaction_without_gpr():
+ model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "")])
+ added = expand_model(model)
+ assert added == []
+ assert "r1" in {r.id for r in model.reactions}
+
+
+def test_does_not_expand_single_and_clause():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 and g2"),
+ ])
+ added = expand_model(model)
+ assert added == []
+ r1 = model.reactions.get_by_id("r1")
+ assert r1.gene_reaction_rule == "g1 and g2"
+
+
+def test_does_not_expand_single_gene():
+ model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1")])
+ added = expand_model(model)
+ assert added == []
+ assert model.reactions.get_by_id("r1").gene_reaction_rule == "g1"
+
+
+def test_splits_simple_or_into_two_reactions():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ])
+ added = expand_model(model)
+
+ assert added == ["r1_EXP_1", "r1_EXP_2"]
+ rxn_ids = {r.id for r in model.reactions}
+ assert "r1" not in rxn_ids
+ assert "r1_EXP_1" in rxn_ids
+ assert "r1_EXP_2" in rxn_ids
+
+ assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1"
+ assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g2"
+
+
+def test_splits_or_of_ands():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+ "(g1 and g2) or (g3 and g4)"),
+ ])
+ added = expand_model(model)
+
+ assert added == ["r1_EXP_1", "r1_EXP_2"]
+ assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2"
+ assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g3 and g4"
+
+
+def test_distributes_and_over_or():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+ "g1 and (g2 or g3)"),
+ ])
+ added = expand_model(model)
+
+ assert added == ["r1_EXP_1", "r1_EXP_2"]
+ assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2"
+ assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g1 and g3"
+
+
+def test_expanded_reactions_inherit_stoichiometry_and_bounds():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 2.0}, -500.0, 1500.0, "g1 or g2"),
+ ])
+ expand_model(model)
+
+ for suffix in ("_EXP_1", "_EXP_2"):
+ rxn = model.reactions.get_by_id(f"r1{suffix}")
+ assert rxn.bounds == (-500.0, 1500.0)
+ stoich = {m.id: c for m, c in rxn.metabolites.items()}
+ assert stoich == {"A": -1.0, "B": 2.0}
+
+
+def test_expanded_reactions_inherit_name_and_subsystem():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ])
+ r1 = model.reactions.get_by_id("r1")
+ r1.name = "an isozyme-catalyzed reaction"
+ r1.subsystem = "central metabolism"
+
+ expand_model(model)
+
+ for suffix in ("_EXP_1", "_EXP_2"):
+ rxn = model.reactions.get_by_id(f"r1{suffix}")
+ assert rxn.name == "an isozyme-catalyzed reaction"
+ assert rxn.subsystem == "central metabolism"
+
+
+def test_multiple_reactions_expand_independently():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"),
+ ("r3", {"C": -1.0, "D": 1.0}, 0.0, 1000.0,
+ "(g5 and g6) or g7 or (g8 and g9)"),
+ ])
+ added = expand_model(model)
+
+ assert added == sorted([
+ "r1_EXP_1", "r1_EXP_2",
+ "r3_EXP_1", "r3_EXP_2", "r3_EXP_3",
+ ])
+
+ rxn_ids = {r.id for r in model.reactions}
+ assert "r2" in rxn_ids
+ assert "r1" not in rxn_ids
+ assert "r3" not in rxn_ids
+
+ assert model.reactions.get_by_id("r2").gene_reaction_rule == "g3 and g4"
+ assert model.reactions.get_by_id("r3_EXP_1").gene_reaction_rule == "g5 and g6"
+ assert model.reactions.get_by_id("r3_EXP_2").gene_reaction_rule == "g7"
+ assert model.reactions.get_by_id("r3_EXP_3").gene_reaction_rule == "g8 and g9"
+
+
+def test_expanded_reaction_has_correct_gene_set():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+ "(g1 and g2) or (g3 and g4)"),
+ ])
+ expand_model(model)
+
+ r1_1 = model.reactions.get_by_id("r1_EXP_1")
+ assert {g.id for g in r1_1.genes} == {"g1", "g2"}
+
+ r1_2 = model.reactions.get_by_id("r1_EXP_2")
+ assert {g.id for g in r1_2.genes} == {"g3", "g4"}
+
+
+def test_expansion_is_idempotent_in_the_no_op_sense():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"),
+ ])
+ expand_model(model)
+ ids_before = {r.id for r in model.reactions}
+
+ second = expand_model(model)
+ assert second == []
+
+ ids_after = {r.id for r in model.reactions}
+ assert ids_after == ids_before
+
+
+def test_empty_model_is_unchanged():
+ model = cobra.Model("empty")
+ assert expand_model(model) == []
+
+
+# --------------------------------------------------------------------------- #
+# Annotation and notes propagation
+# --------------------------------------------------------------------------- #
+
+def test_expanded_reactions_inherit_annotation_and_notes():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ])
+ r1 = model.reactions.get_by_id("r1")
+ r1.annotation["ec-code"] = "1.2.3.4"
+ r1.annotation["sbo"] = "SBO:0000176"
+ r1.notes["custom"] = "hello"
+
+ expand_model(model)
+
+ for suffix in ("_EXP_1", "_EXP_2"):
+ rxn = model.reactions.get_by_id(f"r1{suffix}")
+ assert rxn.annotation["ec-code"] == "1.2.3.4"
+ assert rxn.annotation["sbo"] == "SBO:0000176"
+ assert rxn.notes["custom"] == "hello"
+
+
+def test_expanded_reaction_annotation_is_independent_of_parent():
+ """Mutating one expanded reaction's annotation must not affect siblings."""
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ])
+ model.reactions.get_by_id("r1").annotation["ec-code"] = ["1.2.3.4"]
+
+ expand_model(model)
+
+ r1_1 = model.reactions.get_by_id("r1_EXP_1")
+ r1_2 = model.reactions.get_by_id("r1_EXP_2")
+ r1_1.annotation["ec-code"].append("9.9.9.9")
+ assert r1_2.annotation["ec-code"] == ["1.2.3.4"]
+
+
+def test_objective_coefficient_preserved_on_expansion():
+ """An expanded reaction's isozyme copies retain the original objective coefficient."""
+ m = cobra.Model("o")
+ a, b = (cobra.Metabolite(x, compartment="c") for x in "ab")
+ m.add_metabolites([a, b])
+ r = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+ r.add_metabolites({a: -1, b: 1})
+ r.gene_reaction_rule = "g1 or g2"
+ m.add_reactions([r])
+ m.objective = "r1" # objective on the soon-to-be-expanded reaction
+
+ expand_model(m)
+ coeffs = {rx.id: rx.objective_coefficient for rx in m.reactions}
+ assert coeffs == {"r1_EXP_1": 1.0, "r1_EXP_2": 1.0} # objective survives on both copies
diff --git a/tests/test_manipulation_irreversible.py b/tests/test_manipulation_irreversible.py
new file mode 100644
index 0000000..e211fa3
--- /dev/null
+++ b/tests/test_manipulation_irreversible.py
@@ -0,0 +1,144 @@
+"""Tests for convert_to_irreversible (RAVEN convertToIrrev.m).
+
+Adopted from geckopy's tests/test_preprocess.py (the convert_to_irreversible subset).
+Exchange reactions are excluded from the split, matching MATLAB behavior.
+"""
+import cobra
+
+from raven_python.manipulation import convert_to_irreversible
+
+
+def _build_model_with_bounds(
+ reactions: list[tuple[str, dict[str, float], float, float]],
+) -> cobra.Model:
+ """Build from (rxn_id, {met_id: coef}, lb, ub) tuples."""
+ model = cobra.Model("test")
+ mets: dict[str, cobra.Metabolite] = {}
+ for _, stoich, _, _ in reactions:
+ for met_id in stoich:
+ if met_id not in mets:
+ mets[met_id] = cobra.Metabolite(met_id, compartment="c")
+
+ for rxn_id, stoich, lb, ub in reactions:
+ rxn = cobra.Reaction(rxn_id)
+ rxn.lower_bound = lb
+ rxn.upper_bound = ub
+ rxn.add_metabolites({mets[m]: c for m, c in stoich.items()})
+ model.add_reactions([rxn])
+ return model
+
+
+def test_splits_single_reversible_non_exchange():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ])
+
+ added = convert_to_irreversible(model)
+ assert added == ["r1_REV"]
+
+ fwd = model.reactions.get_by_id("r1")
+ rev = model.reactions.get_by_id("r1_REV")
+
+ assert fwd.bounds == (0.0, 1000.0)
+ assert {m.id: c for m, c in fwd.metabolites.items()} == {"A": -1.0, "B": 1.0}
+
+ assert rev.bounds == (0.0, 500.0)
+ assert {m.id: c for m, c in rev.metabolites.items()} == {"A": 1.0, "B": -1.0}
+
+
+def test_does_not_split_forward_only_reaction():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0),
+ ])
+ added = convert_to_irreversible(model)
+ assert added == []
+ assert "r1_REV" not in {r.id for r in model.reactions}
+
+
+def test_does_not_split_exchange_reaction_even_if_reversible():
+ """Exchange reactions (one metabolite) are explicitly excluded from
+ the irreversibility step in MATLAB, regardless of bounds."""
+ model = _build_model_with_bounds([
+ ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+ ])
+ added = convert_to_irreversible(model)
+ assert added == []
+ ex = model.reactions.get_by_id("EX_A")
+ assert ex.bounds == (-1000.0, 1000.0)
+
+
+def test_splits_multiple_mixed_reactions():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), # split
+ ("r2", {"B": -2.0, "C": 3.0}, 0.0, 1000.0), # forward only
+ ("EX_A", {"A": -1.0}, -1000.0, 1000.0), # exchange
+ ("r3", {"C": -1.0, "D": 1.0}, -200.0, 200.0), # split
+ ])
+
+ added = convert_to_irreversible(model)
+ assert added == ["r1_REV", "r3_REV"]
+
+ assert model.reactions.get_by_id("r1").bounds == (0.0, 1000.0)
+ assert model.reactions.get_by_id("r1_REV").bounds == (0.0, 500.0)
+ assert model.reactions.get_by_id("r2").bounds == (0.0, 1000.0)
+ assert model.reactions.get_by_id("EX_A").bounds == (-1000.0, 1000.0)
+ assert model.reactions.get_by_id("r3").bounds == (0.0, 200.0)
+ assert model.reactions.get_by_id("r3_REV").bounds == (0.0, 200.0)
+
+
+def test_reverse_reaction_inherits_gpr():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ])
+ model.reactions.get_by_id("r1").gene_reaction_rule = "g1 and g2"
+
+ convert_to_irreversible(model)
+
+ rev = model.reactions.get_by_id("r1_REV")
+ assert rev.gene_reaction_rule == "g1 and g2"
+ assert {g.id for g in rev.genes} == {"g1", "g2"}
+
+
+def test_forward_reaction_lb_is_clamped_to_zero():
+ """After splitting, the original reaction should have lb = 0,
+ which is what MATLAB's convertToIrrev does."""
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ])
+ convert_to_irreversible(model)
+ assert model.reactions.get_by_id("r1").lower_bound == 0.0
+
+
+def test_no_reverse_reaction_has_negative_bound():
+ """After conversion, no non-exchange reaction may carry negative flux."""
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ("r2", {"B": -1.0, "C": 1.0}, -1000.0, 0.0), # blocked reverse
+ ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+ ])
+ convert_to_irreversible(model)
+ for rxn in model.reactions:
+ if rxn.boundary:
+ continue
+ assert rxn.lower_bound >= 0, f"{rxn.id} still has lb < 0"
+
+
+def test_returns_empty_list_when_nothing_to_split():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0),
+ ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+ ])
+ assert convert_to_irreversible(model) == []
+
+
+def test_conversion_is_idempotent_after_first_pass():
+ """Running convert_to_irreversible twice should not create
+ `_REV_REV` reactions, because the first pass already clamped
+ all non-exchange lb to 0."""
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ])
+ convert_to_irreversible(model)
+ second = convert_to_irreversible(model)
+ assert second == []
+ assert "r1_REV_REV" not in {r.id for r in model.reactions}
diff --git a/tests/test_manipulation_merge.py b/tests/test_manipulation_merge.py
new file mode 100644
index 0000000..a430f6e
--- /dev/null
+++ b/tests/test_manipulation_merge.py
@@ -0,0 +1,136 @@
+"""Tests for merge_models (mergeModels port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, merge_models
+
+
+def _model(mid, mets, reactions):
+ m = cobra.Model(mid)
+ m.add_metabolites(mets)
+ add_reactions_from_equations(m, reactions)
+ return m
+
+
+@pytest.fixture
+def model_a():
+ return _model(
+ "A",
+ [
+ cobra.Metabolite("glc_c", name="Glucose", compartment="c"),
+ cobra.Metabolite("g6p_c", name="G6P", compartment="c"),
+ ],
+ [{"id": "HEX", "equation": "glc_c --> g6p_c", "gene_reaction_rule": "GA"}],
+ )
+
+
+@pytest.fixture
+def model_b():
+ # same Glucose[c] compound but a DIFFERENT id
+ return _model(
+ "B",
+ [
+ cobra.Metabolite("glucose_c", name="Glucose", compartment="c"),
+ cobra.Metabolite("lac_c", name="Lactate", compartment="c"),
+ ],
+ [{"id": "LDH", "equation": "glucose_c --> lac_c", "gene_reaction_rule": "GB"}],
+ )
+
+
+def test_unifies_metabolites_by_name_comp(model_a, model_b):
+ merged = merge_models([model_a, model_b])
+ glucoses = [m for m in merged.metabolites if m.name == "Glucose" and m.compartment == "c"]
+ assert len(glucoses) == 1 # glc_c and glucose_c unified
+ # both reactions reference the same merged Glucose object
+ hex_glc = [m for m in merged.reactions.get_by_id("HEX").metabolites if m.name == "Glucose"][0]
+ ldh_glc = [m for m in merged.reactions.get_by_id("LDH").metabolites if m.name == "Glucose"][0]
+ assert hex_glc is ldh_glc
+
+
+def test_match_by_id_keeps_distinct(model_a, model_b):
+ merged = merge_models([model_a, model_b], match_by="id")
+ glucoses = [m for m in merged.metabolites if m.name == "Glucose"]
+ assert len(glucoses) == 2 # glc_c and glucose_c are distinct by id
+
+
+def test_all_reactions_kept(model_a, model_b):
+ merged = merge_models([model_a, model_b])
+ assert {"HEX", "LDH"} <= {r.id for r in merged.reactions}
+
+
+def test_reaction_id_collision_renamed(model_a):
+ # two models with the same reaction id but different chemistry
+ other = _model(
+ "B",
+ [cobra.Metabolite("glc_c", name="Glucose", compartment="c"),
+ cobra.Metabolite("x_c", name="X", compartment="c")],
+ [{"id": "HEX", "equation": "glc_c --> x_c"}],
+ )
+ merged = merge_models([model_a, other])
+ assert "HEX" in {r.id for r in merged.reactions}
+ assert "HEX_B" in {r.id for r in merged.reactions} # renamed with source id
+
+
+def test_genes_merged(model_a, model_b):
+ merged = merge_models([model_a, model_b])
+ assert {"GA", "GB"} <= {g.id for g in merged.genes}
+
+
+def test_provenance_recorded(model_a, model_b):
+ merged = merge_models([model_a, model_b])
+ assert merged.reactions.get_by_id("HEX").notes["origin"] == "A"
+ assert merged.reactions.get_by_id("LDH").notes["origin"] == "B"
+ assert merged.genes.get_by_id("GA").notes["origin"] == "A"
+
+
+def test_compartments_preserved(model_a):
+ model_a.compartments = {"c": "cytoplasm"}
+ merged = merge_models([model_a, model_a.copy()])
+ assert merged.compartments.get("c") == "cytoplasm"
+
+
+def test_single_model_returns_copy(model_a):
+ merged = merge_models([model_a])
+ assert merged is not model_a
+ assert {r.id for r in merged.reactions} == {r.id for r in model_a.reactions}
+
+
+def test_three_models(model_a, model_b):
+ c = _model("C", [cobra.Metabolite("co2_c", name="CO2", compartment="c")],
+ [{"id": "SINK", "equation": "co2_c -->"}])
+ merged = merge_models([model_a, model_b, c])
+ assert {"HEX", "LDH", "SINK"} <= {r.id for r in merged.reactions}
+
+
+def test_bad_match_by(model_a, model_b):
+ with pytest.raises(ValueError, match="match_by"):
+ merge_models([model_a, model_b], match_by="oops")
+
+
+# --- regression: formula/charge conflict (known_issues.md B1) --------------
+
+def test_formula_conflict_warns():
+ """Two models sharing a name[comp] but with different formulas warn instead
+ of silently keeping the first."""
+ a = _model("A",
+ [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", compartment="c")],
+ [{"id": "EX_A", "equation": "g1 -->"}])
+ b = _model("B",
+ [cobra.Metabolite("g2", name="Glucose", formula="C6H12O7", compartment="c")],
+ [{"id": "EX_B", "equation": "g2 -->"}])
+ with pytest.warns(UserWarning, match="different formulas"):
+ merged = merge_models([a, b])
+ # The merge still picks the first-seen — the test asserts the warning fired
+ # and the model survives.
+ assert "EX_A" in merged.reactions and "EX_B" in merged.reactions
+
+
+def test_charge_conflict_warns():
+ a = _model("A",
+ [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", charge=0, compartment="c")],
+ [{"id": "EX_A", "equation": "g1 -->"}])
+ b = _model("B",
+ [cobra.Metabolite("g2", name="Glucose", formula="C6H12O6", charge=-1, compartment="c")],
+ [{"id": "EX_B", "equation": "g2 -->"}])
+ with pytest.warns(UserWarning, match="different charges"):
+ merge_models([a, b])
diff --git a/tests/test_manipulation_remove.py b/tests/test_manipulation_remove.py
new file mode 100644
index 0000000..2b659b9
--- /dev/null
+++ b/tests/test_manipulation_remove.py
@@ -0,0 +1,97 @@
+"""Tests for raven_python.manipulation.remove (removeMets/removeGenes ports)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import (
+ add_reactions_from_equations,
+ remove_genes,
+ remove_metabolites,
+)
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+ cobra.Metabolite("atp_m", name="ATP", compartment="m"),
+ cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+ cobra.Metabolite("x_c", name="X", compartment="c"),
+ ]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"},
+ {"id": "R2", "equation": "atp_c --> x_c", "gene_reaction_rule": "G3 or G4"},
+ {"id": "R3", "equation": "atp_m --> adp_c"}, # no GPR (spontaneous)
+ ],
+ )
+ return m
+
+
+# --- remove_metabolites ----------------------------------------------------
+
+def test_remove_metabolites_by_id(model):
+ remove_metabolites(model, ["x_c"])
+ assert "x_c" not in model.metabolites
+ # reaction kept, just lost the metabolite
+ assert "R2" in model.reactions
+
+
+def test_remove_metabolites_by_name_across_compartments(model):
+ # "ATP" exists in c and m; by_name removes both at once.
+ remove_metabolites(model, ["ATP"], by_name=True)
+ assert "atp_c" not in model.metabolites
+ assert "atp_m" not in model.metabolites
+ assert "adp_c" in model.metabolites
+
+
+def test_remove_metabolites_destructive(model):
+ remove_metabolites(model, ["adp_c"], destructive=True)
+ # R1 and R3 both produced adp_c -> removed
+ assert "adp_c" not in model.metabolites
+ assert "R1" not in model.reactions and "R3" not in model.reactions
+
+
+# --- remove_genes ----------------------------------------------------------
+
+def test_remove_genes_remove_mode(model):
+ blocked = remove_genes(model, ["G1"], blocked_reactions="remove")
+ # R1 = "G1 and G2": removing G1 breaks the complex -> blocked -> removed
+ assert blocked == ["R1"]
+ assert "R1" not in model.reactions
+ assert "R2" in model.reactions # OR rule unaffected
+
+
+def test_remove_genes_constrain_mode(model):
+ blocked = remove_genes(model, ["G1"], blocked_reactions="constrain")
+ assert blocked == ["R1"]
+ r1 = model.reactions.get_by_id("R1")
+ assert r1.bounds == (0, 0) # kept but constrained, per RAVEN default
+ assert r1.gene_reaction_rule == ""
+
+
+def test_remove_genes_keep_mode(model):
+ blocked = remove_genes(model, ["G1"], blocked_reactions="keep")
+ assert blocked == ["R1"]
+ r1 = model.reactions.get_by_id("R1")
+ assert r1.gene_reaction_rule == ""
+ assert r1.bounds != (0, 0) # left untouched
+
+
+def test_remove_genes_or_rule_not_blocked(model):
+ blocked = remove_genes(model, ["G3"], blocked_reactions="remove")
+ # R2 = "G3 or G4": removing G3 leaves G4 -> not blocked
+ assert blocked == []
+ assert model.reactions.get_by_id("R2").gene_reaction_rule == "G4"
+
+
+def test_remove_genes_absent_gene_is_noop(model):
+ assert remove_genes(model, ["NOPE"]) == []
+
+
+def test_remove_genes_bad_policy(model):
+ with pytest.raises(ValueError, match="blocked_reactions"):
+ remove_genes(model, ["G1"], blocked_reactions="explode")
diff --git a/tests/test_manipulation_simplify.py b/tests/test_manipulation_simplify.py
new file mode 100644
index 0000000..586a0c3
--- /dev/null
+++ b/tests/test_manipulation_simplify.py
@@ -0,0 +1,184 @@
+"""Tests for simplifyModel reduction modes."""
+import cobra
+import pytest
+
+from raven_python.manipulation import (
+ add_reactions_from_equations,
+ constrain_reversible_reactions,
+ group_linear_reactions,
+ remove_dead_end_reactions,
+ remove_duplicate_reactions,
+)
+
+# --- remove_dead_end_reactions --------------------------------------------
+
+def test_dead_end_removed():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "dead")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_in", "equation": " --> a"},
+ {"id": "R1", "equation": "a --> b"},
+ {"id": "R_out", "equation": "b --> "},
+ {"id": "R_dead", "equation": "a --> dead"}, # 'dead' only produced
+ ],
+ )
+ removed_rxns, removed_mets = remove_dead_end_reactions(m)
+ assert "R_dead" in removed_rxns
+ assert "dead" in removed_mets
+ # the productive path survives
+ assert {"R_in", "R1", "R_out"} <= {r.id for r in m.reactions}
+
+
+def test_dead_end_respects_reserved():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "dead")])
+ add_reactions_from_equations(
+ m, [{"id": "R_in", "equation": " --> a"}, {"id": "R_dead", "equation": "a --> dead"}]
+ )
+ removed_rxns, _ = remove_dead_end_reactions(m, reserved=["R_dead"])
+ assert "R_dead" not in removed_rxns
+ assert "R_dead" in {r.id for r in m.reactions}
+
+
+# --- remove_duplicate_reactions -------------------------------------------
+
+def test_duplicates_removed():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)},
+ {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)}, # duplicate of R1
+ {"id": "R3", "equation": "a --> b", "bounds": (0, 500)}, # different bounds
+ ],
+ )
+ removed = remove_duplicate_reactions(m)
+ assert len(removed) == 1 # one of R1/R2 removed
+ assert {"R3"} <= {r.id for r in m.reactions}
+ assert sum(r.id in ("R1", "R2") for r in m.reactions) == 1
+
+
+def test_duplicates_keep_reserved():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)},
+ {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)},
+ ],
+ )
+ remove_duplicate_reactions(m, reserved=["R1"])
+ assert "R1" in {r.id for r in m.reactions} # reserved one kept
+
+
+# --- constrain_reversible_reactions ---------------------------------------
+
+def test_forward_only_reversible_constrained():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_in", "equation": " --> a", "bounds": (0, 1000)},
+ {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)}, # can only go fwd
+ {"id": "R_out", "equation": "b --> ", "bounds": (0, 1000)},
+ ],
+ )
+ changed = constrain_reversible_reactions(m)
+ assert "R1" in changed
+ assert m.reactions.get_by_id("R1").lower_bound == 0 # constrained to forward
+
+
+def test_truly_reversible_unchanged():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_in", "equation": " <=> a", "bounds": (-1000, 1000)},
+ {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)},
+ {"id": "R_out", "equation": "b <=> ", "bounds": (-1000, 1000)},
+ ],
+ )
+ changed = constrain_reversible_reactions(m)
+ assert "R1" not in changed # can go both ways
+
+
+# --- group_linear_reactions -----------------------------------------------
+
+def test_linear_chain_merged():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b"}, # b: single producer
+ {"id": "R2", "equation": "b --> c"}, # b: single consumer
+ ],
+ )
+ n_before = len(m.reactions)
+ group_linear_reactions(m)
+ # b is eliminated; R1+R2 merged into one reaction a --> c
+ assert "b" not in m.metabolites
+ assert len(m.reactions) < n_before
+
+
+def test_group_linear_discards_genes():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b", "gene_reaction_rule": "G1"},
+ {"id": "R2", "equation": "b --> c", "gene_reaction_rule": "G2"},
+ ],
+ )
+ group_linear_reactions(m)
+ assert len(m.genes) == 0
+
+
+# --- regression: incremental merge collapses a long chain (known_issues.md D1) ---
+
+def test_group_linear_merges_long_chain_in_one_pass():
+ """The incremental scan still flattens a 5-reaction linear chain — the
+ correctness property the original O(n²·m) restart-after-merge loop had."""
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in "abcdef"])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_in", "equation": " --> a"},
+ {"id": "R1", "equation": "a --> b"},
+ {"id": "R2", "equation": "b --> c"},
+ {"id": "R3", "equation": "c --> d"},
+ {"id": "R4", "equation": "d --> e"},
+ {"id": "R5", "equation": "e --> f"},
+ {"id": "R_out", "equation": "f --> "},
+ ],
+ )
+ group_linear_reactions(m)
+ # All the chain's internal metabolites are gone.
+ assert {x for x in m.metabolites if x.id in {"b", "c", "d", "e"}} == set()
+
+
+# --- regression: NaN FVA on infeasible model (known_issues.md C1) ----------
+
+def test_constrain_reversible_raises_on_infeasible():
+ """An infeasible model produces NaN FVA ranges; the old abs(NaN) < eps
+ check silently treated those as 'truly reversible'. Now raises."""
+ m = cobra.Model("t")
+ a, b = (cobra.Metabolite(x, compartment="c") for x in ("a", "b"))
+ m.add_metabolites([a, b])
+ # Force a contradiction: r requires production AND consumption of a, but
+ # nothing else produces a.
+ r = cobra.Reaction("r", lower_bound=-1, upper_bound=1)
+ r.add_metabolites({a: -1, b: 1})
+ forced = cobra.Reaction("forced", lower_bound=5, upper_bound=10) # infeasible
+ forced.add_metabolites({a: -1})
+ m.add_reactions([r, forced])
+ with pytest.raises(RuntimeError, match="infeasible"):
+ constrain_reversible_reactions(m)
diff --git a/tests/test_manipulation_transfer.py b/tests/test_manipulation_transfer.py
new file mode 100644
index 0000000..61c2ac9
--- /dev/null
+++ b/tests/test_manipulation_transfer.py
@@ -0,0 +1,137 @@
+"""Tests for add_reactions_from_model (addRxnsGenesMets port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, add_reactions_from_model
+
+
+@pytest.fixture
+def draft():
+ m = cobra.Model("draft")
+ m.add_metabolites(
+ [cobra.Metabolite("glc_c", name="Glucose", formula="C6H12O6", compartment="c")]
+ )
+ # an existing reaction so glc_c is in use and we have an id to test skipping
+ add_reactions_from_equations(m, [{"id": "R_existing", "equation": "glc_c <=>"}])
+ return m
+
+
+@pytest.fixture
+def source():
+ m = cobra.Model("source")
+ m.add_metabolites(
+ [
+ # same name[comp] as draft's glc_c but a DIFFERENT id
+ cobra.Metabolite("glucose_c", name="Glucose", formula="C6H12O6", compartment="c"),
+ cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+ cobra.Metabolite("g6p_c", name="G6P", formula="C6H13O9P", compartment="c"),
+ ]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {
+ "id": "HEX",
+ "equation": "glucose_c + atp_c --> g6p_c",
+ "name": "hexokinase",
+ "bounds": (0, 1000),
+ "gene_reaction_rule": "G1",
+ "subsystem": "glycolysis",
+ },
+ {"id": "R_existing", "equation": "glucose_c <=>"}, # id already in draft
+ ],
+ )
+ return m
+
+
+def test_metabolite_matched_by_name_comp_not_id(draft, source):
+ add_reactions_from_model(draft, source, "HEX")
+ hex_rxn = draft.reactions.get_by_id("HEX")
+ # Glucose reused from the draft (id glc_c), NOT the source's glucose_c
+ assert "glc_c" in {m.id for m in hex_rxn.metabolites}
+ assert "glucose_c" not in draft.metabolites
+
+
+def test_new_metabolites_added_with_metadata(draft, source):
+ add_reactions_from_model(draft, source, "HEX")
+ assert "atp_c" in draft.metabolites and "g6p_c" in draft.metabolites
+ assert draft.metabolites.get_by_id("g6p_c").formula == "C6H13O9P"
+ assert draft.metabolites.get_by_id("atp_c").charge == -4
+
+
+def test_reaction_copied_with_bounds_and_name(draft, source):
+ (rxn,) = add_reactions_from_model(draft, source, "HEX")
+ assert rxn.id == "HEX"
+ assert rxn.name == "hexokinase"
+ assert rxn.bounds == (0, 1000)
+ assert rxn.subsystem == "glycolysis"
+ assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+ "glc_c": -1.0,
+ "atp_c": -1.0,
+ "g6p_c": 1.0,
+ }
+
+
+def test_genes_true_copies_gpr_and_creates_genes(draft, source):
+ add_reactions_from_model(draft, source, "HEX", genes=True)
+ assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G1"
+ assert "G1" in draft.genes
+
+
+def test_genes_false_no_gpr(draft, source):
+ add_reactions_from_model(draft, source, "HEX", genes=False)
+ assert draft.reactions.get_by_id("HEX").gene_reaction_rule == ""
+
+
+def test_genes_string_override(draft, source):
+ add_reactions_from_model(draft, source, "HEX", genes="G9 or G10")
+ assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G9 or G10"
+
+
+def test_skips_already_present(draft, source):
+ added = add_reactions_from_model(draft, source, ["HEX", "R_existing"])
+ assert [r.id for r in added] == ["HEX"]
+
+
+def test_all_present_raises(draft, source):
+ with pytest.raises(ValueError, match="already in the model"):
+ add_reactions_from_model(draft, source, "R_existing")
+
+
+def test_unknown_source_reaction_raises(draft, source):
+ with pytest.raises(ValueError, match="not found in the source model"):
+ add_reactions_from_model(draft, source, "NOPE")
+
+
+def test_note_and_confidence_stored(draft, source):
+ (rxn,) = add_reactions_from_model(draft, source, "HEX", note="from KEGG", confidence=2)
+ assert rxn.notes["note"] == "from KEGG"
+ assert rxn.notes["confidence_score"] == 2
+
+
+# --- regression: intra-batch met-id minting collision (known_issues.md A3) ---
+
+def test_intra_batch_id_minting_unique():
+ """Two source mets whose ids both collide with the draft and whose name[comp]
+ differs both get routed through new-id minting. The fix tracks ids minted in
+ the current batch so the two don't collapse to the same generated id."""
+ draft = cobra.Model("draft")
+ draft.add_metabolites([
+ cobra.Metabolite("atp_c", name="ATP-draft", compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP-draft", compartment="c"),
+ ])
+ source = cobra.Model("source")
+ source.add_metabolites([
+ cobra.Metabolite("atp_c", name="ATP-source", compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP-source", compartment="c"),
+ ])
+ rxn = cobra.Reaction("R1", lower_bound=0, upper_bound=1000)
+ source.add_reactions([rxn])
+ rxn.add_metabolites({
+ source.metabolites.get_by_id("atp_c"): -1,
+ source.metabolites.get_by_id("adp_c"): 1,
+ })
+ add_reactions_from_model(draft, source, "R1")
+ # Both source mets minted distinct ids (m1 and m2) — not a collision.
+ new_ids = sorted(m.id for m in draft.metabolites if m.id not in ("atp_c", "adp_c"))
+ assert len(new_ids) == 2 and len(set(new_ids)) == 2
diff --git a/tests/test_manipulation_transport.py b/tests/test_manipulation_transport.py
new file mode 100644
index 0000000..e8fb2b6
--- /dev/null
+++ b/tests/test_manipulation_transport.py
@@ -0,0 +1,98 @@
+"""Tests for add_transport_reactions (addTransport port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_transport_reactions
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.compartments = {"c": "cytoplasm", "m": "mitochondrion", "e": "extracellular"}
+ m.add_metabolites(
+ [
+ cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+ cobra.Metabolite("h2o_c", name="H2O", formula="H2O", compartment="c"),
+ cobra.Metabolite("atp_m", name="ATP", compartment="m"), # exists in m
+ ]
+ )
+ return m
+
+
+def test_basic_transport_to_existing(model):
+ added = add_transport_reactions(model, "c", "m", ["ATP"])
+ assert len(added) == 1
+ rxn = added[0]
+ assert rxn.id == "tr_0001"
+ assert rxn.name == "ATP transport, cytoplasm-mitochondrion"
+ assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+ "atp_c": -1.0,
+ "atp_m": 1.0,
+ }
+ assert rxn.reversibility is True
+
+
+def test_only_to_existing_skips_missing(model):
+ # H2O is not in m; with only_to_existing (default) it's skipped
+ added = add_transport_reactions(model, "c", "m", ["ATP", "H2O"])
+ assert [r.id for r in added] == ["tr_0001"] # only ATP
+
+
+def test_creates_missing_target_metabolite(model):
+ added = add_transport_reactions(
+ model, "c", "m", ["H2O"], only_to_existing=False
+ )
+ assert len(added) == 1
+ new = [mt for mt in model.metabolites if mt.name == "H2O" and mt.compartment == "m"]
+ assert len(new) == 1
+ assert new[0].formula == "H2O" # copied from source
+
+
+def test_copies_formula_and_charge(model):
+ add_transport_reactions(model, "c", "e", ["ATP"], only_to_existing=False)
+ new = [mt for mt in model.metabolites if mt.name == "ATP" and mt.compartment == "e"][0]
+ assert new.formula == "C10H16N5O13P3"
+ assert new.charge == -4
+
+
+def test_irreversible(model):
+ (rxn,) = add_transport_reactions(model, "c", "m", ["ATP"], reversible=False)
+ assert rxn.lower_bound == 0
+ assert rxn.reversibility is False
+
+
+def test_default_all_metabolites_in_from(model):
+ # default metabolite_names = all in c (ATP, H2O); to m, only_to_existing -> only ATP
+ added = add_transport_reactions(model, "c", "m")
+ assert [r.id for r in added] == ["tr_0001"]
+
+
+def test_multiple_target_compartments_and_sequential_ids(model):
+ added = add_transport_reactions(
+ model, "c", ["m", "e"], ["ATP"], only_to_existing=False
+ )
+ assert [r.id for r in added] == ["tr_0001", "tr_0002"]
+
+
+def test_unknown_compartment_raises(model):
+ with pytest.raises(ValueError, match="not in the model"):
+ add_transport_reactions(model, "x", "m", ["ATP"])
+
+
+def test_unknown_metabolite_raises(model):
+ with pytest.raises(ValueError, match="not found in compartment"):
+ add_transport_reactions(model, "c", "m", ["NOPE"])
+
+
+# --- regression: duplicate name in compartment (known_issues.md A4) --------
+
+def test_duplicate_name_in_source_compartment_warns(model):
+ """Two source mets sharing a name in the same compartment warn instead of
+ silently collapsing — previously one was dropped from the lookup dict."""
+ model.add_metabolites([
+ cobra.Metabolite("h2o2_c", name="H2O", compartment="c"), # duplicate name
+ ])
+ with pytest.warns(UserWarning, match="Multiple metabolites named 'H2O'"):
+ added = add_transport_reactions(model, "c", "m", ["H2O"], only_to_existing=False)
+ # Transport still works (uses the first match) — the warning is the signal.
+ assert len(added) == 1
diff --git a/tests/test_parameters.py b/tests/test_parameters.py
new file mode 100644
index 0000000..c0ab06c
--- /dev/null
+++ b/tests/test_parameters.py
@@ -0,0 +1,60 @@
+"""Tests for set_variance_bounds (the var mode of setParam)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, set_variance_bounds
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a_c <=> b_c"},
+ {"id": "R2", "equation": "a_c <=> b_c"},
+ ],
+ )
+ return m
+
+
+def test_band_positive(model):
+ set_variance_bounds(model, "R1", 100, 5) # 97.5 .. 102.5
+ lb, ub = model.reactions.get_by_id("R1").bounds
+ assert lb == pytest.approx(97.5)
+ assert ub == pytest.approx(102.5)
+
+
+def test_band_negative_is_ordered(model):
+ set_variance_bounds(model, "R1", -100, 5)
+ lb, ub = model.reactions.get_by_id("R1").bounds
+ assert lb == pytest.approx(-102.5)
+ assert ub == pytest.approx(-97.5)
+ assert lb <= ub
+
+
+def test_broadcast_scalar(model):
+ set_variance_bounds(model, ["R1", "R2"], 50, 10)
+ for rid in ("R1", "R2"):
+ lb, ub = model.reactions.get_by_id(rid).bounds
+ assert lb == pytest.approx(47.5)
+ assert ub == pytest.approx(52.5)
+
+
+def test_per_reaction_values(model):
+ set_variance_bounds(model, ["R1", "R2"], [100, 200], 0)
+ assert model.reactions.get_by_id("R1").bounds == pytest.approx((100, 100))
+ assert model.reactions.get_by_id("R2").bounds == pytest.approx((200, 200))
+
+
+def test_length_mismatch_raises(model):
+ with pytest.raises(ValueError, match="to match the reactions"):
+ set_variance_bounds(model, ["R1", "R2"], [1, 2, 3], 5)
+
+
+def test_unknown_reaction_raises(model):
+ with pytest.raises(ValueError, match="not found"):
+ set_variance_bounds(model, "NOPE", 1, 5)
diff --git a/tests/test_scripts_registry.py b/tests/test_scripts_registry.py
new file mode 100644
index 0000000..c9c03cf
--- /dev/null
+++ b/tests/test_scripts_registry.py
@@ -0,0 +1,58 @@
+"""Tests for scripts/make_registry_snippet.py registry-entry helpers."""
+import hashlib
+import importlib.util
+import json
+from pathlib import Path
+
+import pytest
+
+# scripts/ is not a package; load the module directly by path.
+_SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "make_registry_snippet.py"
+_spec = importlib.util.spec_from_file_location("make_registry_snippet", _SCRIPT)
+mrs = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(mrs)
+
+
+def _sha(data: bytes) -> str:
+ return hashlib.sha256(data).hexdigest()
+
+
+def test_data_entry_lists_files_with_urls_and_checksums(tmp_path):
+ (tmp_path / "reference_model.yml.gz").write_bytes(b"model")
+ (tmp_path / "ko_reaction.tsv.gz").write_bytes(b"table")
+ (tmp_path / ".hidden").write_bytes(b"skip") # hidden files ignored
+
+ entry = mrs.data_entry("kegg", "kegg116", "https://x/rel/", tmp_path)
+ assert entry["version"] == "kegg116"
+ assert set(entry["files"]) == {"reference_model.yml.gz", "ko_reaction.tsv.gz"}
+ ref = entry["files"]["reference_model.yml.gz"]
+ assert ref["url"] == "https://x/rel/reference_model.yml.gz" # trailing slash collapsed
+ assert ref["sha256"] == _sha(b"model")
+
+
+def test_data_entry_empty_dir_errors(tmp_path):
+ with pytest.raises(SystemExit):
+ mrs.data_entry("kegg", "v1", "https://x", tmp_path)
+
+
+def test_binary_entry_parses_platform_from_filename(tmp_path):
+ (tmp_path / "blast-2.16.0-linux-x86_64.zip").write_bytes(b"linux")
+ (tmp_path / "blast-2.16.0-macos-arm64.zip").write_bytes(b"mac")
+ (tmp_path / "other-1.0-linux-x86_64.zip").write_bytes(b"nope") # different bundle
+
+ entry = mrs.binary_entry("blast", "2.16.0", ["blastp", "makeblastdb"], "https://x", tmp_path)
+ assert entry["provides"] == ["blastp", "makeblastdb"]
+ assert set(entry["platforms"]) == {"linux-x86_64", "macos-arm64"}
+ assert entry["platforms"]["macos-arm64"]["sha256"] == _sha(b"mac")
+ assert entry["platforms"]["linux-x86_64"]["url"].endswith("blast-2.16.0-linux-x86_64.zip")
+
+
+def test_binary_entry_no_zips_errors(tmp_path):
+ with pytest.raises(SystemExit):
+ mrs.binary_entry("blast", "2.16.0", ["blastp"], "https://x", tmp_path)
+
+
+def test_render_is_valid_json_round_trip():
+ entry = {"version": "v1", "files": {"a": {"url": "u", "sha256": "s"}}}
+ text = mrs.render("kegg", entry)
+ assert json.loads(text) == {"kegg": entry}
diff --git a/tests/test_utils_balance.py b/tests/test_utils_balance.py
new file mode 100644
index 0000000..aa1e47e
--- /dev/null
+++ b/tests/test_utils_balance.py
@@ -0,0 +1,76 @@
+"""Tests for get_elemental_balance (getElementalBalance port)."""
+import cobra
+import pytest
+
+from raven_python.utils import ElementalBalance, get_elemental_balance
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("a_c", formula="C6H12O6", charge=0, compartment="c"),
+ cobra.Metabolite("b_c", formula="C6H12O6", charge=0, compartment="c"),
+ cobra.Metabolite("c_c", formula="C3H6O3", charge=0, compartment="c"),
+ cobra.Metabolite("n_c", compartment="c"), # no formula
+ ]
+ )
+ r_bal = cobra.Reaction("R_bal")
+ m.add_reactions([r_bal])
+ r_bal.build_reaction_from_string("a_c --> b_c") # C6H12O6 -> C6H12O6
+ r_unbal = cobra.Reaction("R_unbal")
+ m.add_reactions([r_unbal])
+ r_unbal.build_reaction_from_string("a_c --> c_c") # C6H12O6 -> C3H6O3
+ r_unknown = cobra.Reaction("R_unknown")
+ m.add_reactions([r_unknown])
+ r_unknown.build_reaction_from_string("a_c --> n_c") # n_c has no formula
+ return m
+
+
+def test_balanced(model):
+ (res,) = get_elemental_balance(model, ["R_bal"])
+ assert res == ElementalBalance("R_bal", "balanced", {})
+
+
+def test_unbalanced_reports_imbalance(model):
+ (res,) = get_elemental_balance(model, ["R_unbal"])
+ assert res.status == "unbalanced"
+ # products - reactants: C3H6O3 - C6H12O6 = -C3H6O3
+ assert res.imbalance == {"C": -3.0, "H": -6.0, "O": -3.0}
+
+
+def test_missing_formula_is_unknown_not_silently_wrong(model):
+ # cobra's check_mass_balance alone would silently report an imbalance here;
+ # we flag it as unknown instead.
+ (res,) = get_elemental_balance(model, ["R_unknown"])
+ assert res.status == "unknown"
+ assert res.imbalance == {}
+
+
+def test_all_reactions_default(model):
+ results = get_elemental_balance(model)
+ assert {r.reaction_id: r.status for r in results} == {
+ "R_bal": "balanced",
+ "R_unbal": "unbalanced",
+ "R_unknown": "unknown",
+ }
+
+
+def test_charge_excluded(model):
+ # give a charge imbalance but keep elements balanced -> still "balanced"
+ model.metabolites.get_by_id("b_c").charge = 1
+ (res,) = get_elemental_balance(model, ["R_bal"])
+ assert res.status == "balanced"
+
+
+# --- regression: empty reaction → unknown (known_issues.md F5) -------------
+
+def test_empty_reaction_is_unknown(model):
+ """A reaction with no metabolites used to be reported `balanced`
+ vacuously (any() over an empty list is False and check_mass_balance
+ returns no imbalance). Now reports `unknown`."""
+ empty = cobra.Reaction("R_empty", lower_bound=0, upper_bound=1000)
+ model.add_reactions([empty])
+ (res,) = get_elemental_balance(model, ["R_empty"])
+ assert res.status == "unknown"
diff --git a/tests/test_utils_gpr.py b/tests/test_utils_gpr.py
new file mode 100644
index 0000000..275d020
--- /dev/null
+++ b/tests/test_utils_gpr.py
@@ -0,0 +1,84 @@
+"""Tests for raven_python.utils.gpr (GPR linting)."""
+import cobra
+import pytest
+
+from raven_python.utils import GPRIssue, find_non_dnf_grrules, is_dnf
+
+
+@pytest.mark.parametrize(
+ "rule",
+ [
+ "",
+ "G1",
+ "G1 and G2",
+ "G1 or G2",
+ "G1 and G2 and G3",
+ "G1 or G2 or G3",
+ "(G1 and G2) or G3",
+ "(G1 and G2) or (G3 and G4)",
+ "G1 or (G2 and G3)",
+ ],
+)
+def test_is_dnf_true(rule):
+ assert is_dnf(rule) is True
+
+
+@pytest.mark.parametrize(
+ "rule",
+ [
+ "(G1 or G2) and G3",
+ "G1 and (G2 or G3)",
+ "(G1 or G2) and (G3 or G4)",
+ "G1 and (G2 or (G3 and G4))",
+ ],
+)
+def test_is_dnf_false(rule):
+ assert is_dnf(rule) is False
+
+
+def test_is_dnf_accepts_gpr_and_none():
+ from cobra.core.gene import GPR
+
+ assert is_dnf(GPR.from_string("(G1 or G2) and G3")) is False
+ assert is_dnf(GPR.from_string("G1 or G2")) is True
+ assert is_dnf(None) is True
+
+
+def test_is_dnf_independent_of_formatting():
+ # cobra normalises on assignment, so casing/whitespace cannot change the verdict.
+ assert is_dnf("(G1 OR G2) AND G3") is False
+ assert is_dnf("( G1 and G2 ) or G3") is True
+
+
+def _model_with_rules(rules: dict[str, str]) -> cobra.Model:
+ model = cobra.Model("t")
+ model.add_reactions([cobra.Reaction(rid) for rid in rules])
+ for rid, rule in rules.items():
+ model.reactions.get_by_id(rid).gene_reaction_rule = rule
+ return model
+
+
+def test_find_non_dnf_grrules_flags_only_offenders():
+ model = _model_with_rules(
+ {
+ "R_ok_single": "G1",
+ "R_ok_complex": "G1 and G2",
+ "R_ok_dnf": "(G1 and G2) or G3",
+ "R_no_gpr": "",
+ "R_bad_1": "(G1 or G2) and G3",
+ "R_bad_2": "(G1 or G2) and (G3 or G4)",
+ }
+ )
+
+ issues = find_non_dnf_grrules(model)
+
+ assert [i.reaction_id for i in issues] == ["R_bad_1", "R_bad_2"]
+ assert all(isinstance(i, GPRIssue) for i in issues)
+ assert all("disjunctive normal form" in i.reason for i in issues)
+ # the reported GPR is the cobra-normalised string
+ assert issues[0].gpr == "(G1 or G2) and G3"
+
+
+def test_find_non_dnf_grrules_empty_when_all_clean():
+ model = _model_with_rules({"R1": "G1 or G2", "R2": "(G1 and G2) or G3"})
+ assert find_non_dnf_grrules(model) == []
diff --git a/tests/test_utils_sort.py b/tests/test_utils_sort.py
new file mode 100644
index 0000000..18bca24
--- /dev/null
+++ b/tests/test_utils_sort.py
@@ -0,0 +1,42 @@
+"""Tests for sort_identifiers and write_yaml_model(sort_ids=True)."""
+import cobra
+
+from raven_python.io import read_yaml_model, write_yaml_model
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils import sort_identifiers
+
+
+def _model():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("b_c", "a_c")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R2", "equation": "a_c --> b_c", "gene_reaction_rule": "GB"},
+ {"id": "R1", "equation": "b_c --> a_c", "gene_reaction_rule": "GA"},
+ ],
+ )
+ return m
+
+
+def test_sort_identifiers_orders_everything():
+ m = _model()
+ sort_identifiers(m)
+ assert [r.id for r in m.reactions] == ["R1", "R2"]
+ assert [x.id for x in m.metabolites] == ["a_c", "b_c"]
+ assert [g.id for g in m.genes] == ["GA", "GB"]
+ # lookup index still intact after sorting
+ assert m.reactions.get_by_id("R2").id == "R2"
+
+
+def test_write_yaml_sort_ids_does_not_mutate(tmp_path):
+ m = _model()
+ order_before = [r.id for r in m.reactions]
+ out = tmp_path / "m.yml"
+ write_yaml_model(m, out, sort_ids=True)
+ assert [r.id for r in m.reactions] == order_before # model untouched
+ # but the file is sorted
+ text = out.read_text()
+ assert text.index("R1") < text.index("R2")
+ reloaded = read_yaml_model(out)
+ assert [r.id for r in reloaded.reactions] == ["R1", "R2"]
diff --git a/tests/test_utils_validate.py b/tests/test_utils_validate.py
new file mode 100644
index 0000000..2d38e6f
--- /dev/null
+++ b/tests/test_utils_validate.py
@@ -0,0 +1,80 @@
+"""Tests for check_model (the surviving checks of checkModelStruct)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils import ModelIssue, check_model
+
+
+def _categories(issues, category):
+ return [i.object_id for i in issues if i.category == category]
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("a_c", name="A", compartment="c"),
+ cobra.Metabolite("b_c", name="B", compartment="c"),
+ ]
+ )
+ add_reactions_from_equations(
+ m, [{"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}]
+ )
+ m.reactions.get_by_id("R1").objective_coefficient = 1
+ return m
+
+
+def test_clean_model_has_no_issues(model):
+ assert check_model(model) == []
+
+
+def test_orphan_metabolite(model):
+ model.add_metabolites([cobra.Metabolite("orphan_c", name="Orphan", compartment="c")])
+ assert "orphan_c" in _categories(check_model(model), "orphan_metabolite")
+
+
+def test_orphan_gene(model):
+ model.genes.append(cobra.core.gene.Gene("G_lonely"))
+ assert "G_lonely" in _categories(check_model(model), "orphan_gene")
+
+
+def test_empty_reaction(model):
+ model.add_reactions([cobra.Reaction("R_empty")])
+ assert "R_empty" in _categories(check_model(model), "empty_reaction")
+
+
+def test_empty_metabolite_name(model):
+ model.add_metabolites([cobra.Metabolite("noname_c", compartment="c")])
+ # also an orphan, but we check the name category specifically
+ assert "noname_c" in _categories(check_model(model), "empty_metabolite_name")
+
+
+def test_duplicate_name_compartment(model):
+ # second metabolite named "A" in compartment c
+ dup = cobra.Metabolite("a2_c", name="A", compartment="c")
+ model.add_metabolites([dup])
+ model.reactions.get_by_id("R1").add_metabolites({dup: -1}) # keep it used
+ issues = [i for i in check_model(model) if i.category == "duplicate_name_compartment"]
+ assert len(issues) == 1
+ assert "a_c" in issues[0].message and "a2_c" in issues[0].message
+
+
+def test_no_objective(model):
+ model.reactions.get_by_id("R1").objective_coefficient = 0
+ cats = [i.category for i in check_model(model)]
+ assert "objective" in cats
+
+
+def test_multiple_objectives(model):
+ add_reactions_from_equations(model, [{"id": "R2", "equation": "b_c --> a_c"}])
+ model.reactions.get_by_id("R2").objective_coefficient = 1
+ obj_issues = [i for i in check_model(model) if i.category == "objective"]
+ assert len(obj_issues) == 1
+ assert "Multiple" in obj_issues[0].message
+
+
+def test_returns_model_issue_instances(model):
+ model.add_reactions([cobra.Reaction("R_empty")])
+ assert all(isinstance(i, ModelIssue) for i in check_model(model))