diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..3f6d19b
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,48 @@
+name: CI
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ workflow_dispatch:
+
+# A push that obsoletes a previous run cancels it.
+concurrency:
+ group: ci-${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ lint:
+ name: ruff
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ cache: pip
+ cache-dependency-path: pyproject.toml
+ - run: pip install --upgrade pip
+ - run: pip install ruff
+ - run: ruff check .
+
+ test:
+ name: pytest (py${{ matrix.python }})
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python: ["3.11", "3.12", "3.13"]
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python }}
+ cache: pip
+ cache-dependency-path: pyproject.toml
+ - run: pip install --upgrade pip
+ # ``-e .[dev,plotting,excel]`` so every optional extra is exercised.
+ # Gurobi is not installable on free runners; the relevant tests
+ # skip themselves when ``optlang.gurobi_interface`` cannot import.
+ - run: pip install -e ".[dev,plotting,excel]"
+ - run: pytest -q --maxfail=5 --durations=20
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1f44cf3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,541 @@
+GNU GENERAL PUBLIC LICENSE
+==========================
+
+Version 3, 29 June 2007
+
+Copyright © 2007 Free Software Foundation, Inc. <>
+
+Everyone is permitted to copy and distribute verbatim copies of this license
+document, but changing it is not allowed.
+
+## Preamble
+
+The GNU General Public License is a free, copyleft license for software and other
+kinds of works.
+
+The licenses for most software and other practical works are designed to take away
+your freedom to share and change the works. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change all versions of a
+program--to make sure it remains free software for all its users. We, the Free
+Software Foundation, use the GNU General Public License for most of our software; it
+applies also to any other work released this way by its authors. You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not price. Our General
+Public Licenses are designed to make sure that you have the freedom to distribute
+copies of free software (and charge for them if you wish), that you receive source
+code or can get it if you want it, that you can change the software or use pieces of
+it in new free programs, and that you know you can do these things.
+
+To protect your rights, we need to prevent others from denying you these rights or
+asking you to surrender the rights. Therefore, you have certain responsibilities if
+you distribute copies of the software, or if you modify it: responsibilities to
+respect the freedom of others.
+
+For example, if you distribute copies of such a program, whether gratis or for a fee,
+you must pass on to the recipients the same freedoms that you received. You must make
+sure that they, too, receive or can get the source code. And you must show them these
+terms so they know their rights.
+
+Developers that use the GNU GPL protect your rights with two steps: (1) assert
+copyright on the software, and (2) offer you this License giving you legal permission
+to copy, distribute and/or modify it.
+
+For the developers' and authors' protection, the GPL clearly explains that there is
+no warranty for this free software. For both users' and authors' sake, the GPL
+requires that modified versions be marked as changed, so that their problems will not
+be attributed erroneously to authors of previous versions.
+
+Some devices are designed to deny users access to install or run modified versions of
+the software inside them, although the manufacturer can do so. This is fundamentally
+incompatible with the aim of protecting users' freedom to change the software. The
+systematic pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we have designed
+this version of the GPL to prohibit the practice for those products. If such problems
+arise substantially in other domains, we stand ready to extend this provision to
+those domains in future versions of the GPL, as needed to protect the freedom of
+users.
+
+Finally, every program is threatened constantly by software patents. States should
+not allow patents to restrict development and use of software on general-purpose
+computers, but in those that do, we wish to avoid the special danger that patents
+applied to a free program could make it effectively proprietary. To prevent this, the
+GPL assures that patents cannot be used to render the program non-free.
+
+The precise terms and conditions for copying, distribution and modification follow.
+
+## TERMS AND CONDITIONS
+
+### 0. Definitions.
+
+“This License” refers to version 3 of the GNU General Public License.
+
+“Copyright” also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+“The Program” refers to any copyrightable work licensed under this
+License. Each licensee is addressed as “you”. “Licensees” and
+“recipients” may be individuals or organizations.
+
+To “modify” a work means to copy from or adapt all or part of the work in
+a fashion requiring copyright permission, other than the making of an exact copy. The
+resulting work is called a “modified version” of the earlier work or a
+work “based on” the earlier work.
+
+A “covered work” means either the unmodified Program or a work based on
+the Program.
+
+To “propagate” a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for infringement under
+applicable copyright law, except executing it on a computer or modifying a private
+copy. Propagation includes copying, distribution (with or without modification),
+making available to the public, and in some countries other activities as well.
+
+To “convey” a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through a computer
+network, with no transfer of a copy, is not conveying.
+
+An interactive user interface displays “Appropriate Legal Notices” to the
+extent that it includes a convenient and prominently visible feature that (1)
+displays an appropriate copyright notice, and (2) tells the user that there is no
+warranty for the work (except to the extent that warranties are provided), that
+licensees may convey the work under this License, and how to view a copy of this
+License. If the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+### 1. Source Code.
+
+The “source code” for a work means the preferred form of the work for
+making modifications to it. “Object code” means any non-source form of a
+work.
+
+A “Standard Interface” means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of interfaces
+specified for a particular programming language, one that is widely used among
+developers working in that language.
+
+The “System Libraries” of an executable work include anything, other than
+the work as a whole, that (a) is included in the normal form of packaging a Major
+Component, but which is not part of that Major Component, and (b) serves only to
+enable use of the work with that Major Component, or to implement a Standard
+Interface for which an implementation is available to the public in source code form.
+A “Major Component”, in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system (if any) on which
+the executable work runs, or a compiler used to produce the work, or an object code
+interpreter used to run it.
+
+The “Corresponding Source” for a work in object code form means all the
+source code needed to generate, install, and (for an executable work) run the object
+code and to modify the work, including scripts to control those activities. However,
+it does not include the work's System Libraries, or general-purpose tools or
+generally available free programs which are used unmodified in performing those
+activities but which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for the work, and
+the source code for shared libraries and dynamically linked subprograms that the work
+is specifically designed to require, such as by intimate data communication or
+control flow between those subprograms and other parts of the work.
+
+The Corresponding Source need not include anything that users can regenerate
+automatically from other parts of the Corresponding Source.
+
+The Corresponding Source for a work in source code form is that same work.
+
+### 2. Basic Permissions.
+
+All rights granted under this License are granted for the term of copyright on the
+Program, and are irrevocable provided the stated conditions are met. This License
+explicitly affirms your unlimited permission to run the unmodified Program. The
+output from running a covered work is covered by this License only if the output,
+given its content, constitutes a covered work. This License acknowledges your rights
+of fair use or other equivalent, as provided by copyright law.
+
+You may make, run and propagate covered works that you do not convey, without
+conditions so long as your license otherwise remains in force. You may convey covered
+works to others for the sole purpose of having them make modifications exclusively
+for you, or provide you with facilities for running those works, provided that you
+comply with the terms of this License in conveying all material for which you do not
+control copyright. Those thus making or running the covered works for you must do so
+exclusively on your behalf, under your direction and control, on terms that prohibit
+them from making any copies of your copyrighted material outside their relationship
+with you.
+
+Conveying under any other circumstances is permitted solely under the conditions
+stated below. Sublicensing is not allowed; section 10 makes it unnecessary.
+
+### 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+No covered work shall be deemed part of an effective technological measure under any
+applicable law fulfilling obligations under article 11 of the WIPO copyright treaty
+adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention
+of such measures.
+
+When you convey a covered work, you waive any legal power to forbid circumvention of
+technological measures to the extent such circumvention is effected by exercising
+rights under this License with respect to the covered work, and you disclaim any
+intention to limit operation or modification of the work as a means of enforcing,
+against the work's users, your or third parties' legal rights to forbid circumvention
+of technological measures.
+
+### 4. Conveying Verbatim Copies.
+
+You may convey verbatim copies of the Program's source code as you receive it, in any
+medium, provided that you conspicuously and appropriately publish on each copy an
+appropriate copyright notice; keep intact all notices stating that this License and
+any non-permissive terms added in accord with section 7 apply to the code; keep
+intact all notices of the absence of any warranty; and give all recipients a copy of
+this License along with the Program.
+
+You may charge any price or no price for each copy that you convey, and you may offer
+support or warranty protection for a fee.
+
+### 5. Conveying Modified Source Versions.
+
+You may convey a work based on the Program, or the modifications to produce it from
+the Program, in the form of source code under the terms of section 4, provided that
+you also meet all of these conditions:
+
+* **a)** The work must carry prominent notices stating that you modified it, and giving a
+relevant date.
+* **b)** The work must carry prominent notices stating that it is released under this
+License and any conditions added under section 7. This requirement modifies the
+requirement in section 4 to “keep intact all notices”.
+* **c)** You must license the entire work, as a whole, under this License to anyone who
+comes into possession of a copy. This License will therefore apply, along with any
+applicable section 7 additional terms, to the whole of the work, and all its parts,
+regardless of how they are packaged. This License gives no permission to license the
+work in any other way, but it does not invalidate such permission if you have
+separately received it.
+* **d)** If the work has interactive user interfaces, each must display Appropriate Legal
+Notices; however, if the Program has interactive interfaces that do not display
+Appropriate Legal Notices, your work need not make them do so.
+
+A compilation of a covered work with other separate and independent works, which are
+not by their nature extensions of the covered work, and which are not combined with
+it such as to form a larger program, in or on a volume of a storage or distribution
+medium, is called an “aggregate” if the compilation and its resulting
+copyright are not used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work in an aggregate
+does not cause this License to apply to the other parts of the aggregate.
+
+### 6. Conveying Non-Source Forms.
+
+You may convey a covered work in object code form under the terms of sections 4 and
+5, provided that you also convey the machine-readable Corresponding Source under the
+terms of this License, in one of these ways:
+
+* **a)** Convey the object code in, or embodied in, a physical product (including a
+physical distribution medium), accompanied by the Corresponding Source fixed on a
+durable physical medium customarily used for software interchange.
+* **b)** Convey the object code in, or embodied in, a physical product (including a
+physical distribution medium), accompanied by a written offer, valid for at least
+three years and valid for as long as you offer spare parts or customer support for
+that product model, to give anyone who possesses the object code either (1) a copy of
+the Corresponding Source for all the software in the product that is covered by this
+License, on a durable physical medium customarily used for software interchange, for
+a price no more than your reasonable cost of physically performing this conveying of
+source, or (2) access to copy the Corresponding Source from a network server at no
+charge.
+* **c)** Convey individual copies of the object code with a copy of the written offer to
+provide the Corresponding Source. This alternative is allowed only occasionally and
+noncommercially, and only if you received the object code with such an offer, in
+accord with subsection 6b.
+* **d)** Convey the object code by offering access from a designated place (gratis or for
+a charge), and offer equivalent access to the Corresponding Source in the same way
+through the same place at no further charge. You need not require recipients to copy
+the Corresponding Source along with the object code. If the place to copy the object
+code is a network server, the Corresponding Source may be on a different server
+(operated by you or a third party) that supports equivalent copying facilities,
+provided you maintain clear directions next to the object code saying where to find
+the Corresponding Source. Regardless of what server hosts the Corresponding Source,
+you remain obligated to ensure that it is available for as long as needed to satisfy
+these requirements.
+* **e)** Convey the object code using peer-to-peer transmission, provided you inform
+other peers where the object code and Corresponding Source of the work are being
+offered to the general public at no charge under subsection 6d.
+
+A separable portion of the object code, whose source code is excluded from the
+Corresponding Source as a System Library, need not be included in conveying the
+object code work.
+
+A “User Product” is either (1) a “consumer product”, which
+means any tangible personal property which is normally used for personal, family, or
+household purposes, or (2) anything designed or sold for incorporation into a
+dwelling. In determining whether a product is a consumer product, doubtful cases
+shall be resolved in favor of coverage. For a particular product received by a
+particular user, “normally used” refers to a typical or common use of
+that class of product, regardless of the status of the particular user or of the way
+in which the particular user actually uses, or expects or is expected to use, the
+product. A product is a consumer product regardless of whether the product has
+substantial commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+“Installation Information” for a User Product means any methods,
+procedures, authorization keys, or other information required to install and execute
+modified versions of a covered work in that User Product from a modified version of
+its Corresponding Source. The information must suffice to ensure that the continued
+functioning of the modified object code is in no case prevented or interfered with
+solely because modification has been made.
+
+If you convey an object code work under this section in, or with, or specifically for
+use in, a User Product, and the conveying occurs as part of a transaction in which
+the right of possession and use of the User Product is transferred to the recipient
+in perpetuity or for a fixed term (regardless of how the transaction is
+characterized), the Corresponding Source conveyed under this section must be
+accompanied by the Installation Information. But this requirement does not apply if
+neither you nor any third party retains the ability to install modified object code
+on the User Product (for example, the work has been installed in ROM).
+
+The requirement to provide Installation Information does not include a requirement to
+continue to provide support service, warranty, or updates for a work that has been
+modified or installed by the recipient, or for the User Product in which it has been
+modified or installed. Access to a network may be denied when the modification itself
+materially and adversely affects the operation of the network or violates the rules
+and protocols for communication across the network.
+
+Corresponding Source conveyed, and Installation Information provided, in accord with
+this section must be in a format that is publicly documented (and with an
+implementation available to the public in source code form), and must require no
+special password or key for unpacking, reading or copying.
+
+### 7. Additional Terms.
+
+“Additional permissions” are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions. Additional
+permissions that are applicable to the entire Program shall be treated as though they
+were included in this License, to the extent that they are valid under applicable
+law. If additional permissions apply only to part of the Program, that part may be
+used separately under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+When you convey a copy of a covered work, you may at your option remove any
+additional permissions from that copy, or from any part of it. (Additional
+permissions may be written to require their own removal in certain cases when you
+modify the work.) You may place additional permissions on material, added by you to a
+covered work, for which you have or can give appropriate copyright permission.
+
+Notwithstanding any other provision of this License, for material you add to a
+covered work, you may (if authorized by the copyright holders of that material)
+supplement the terms of this License with terms:
+
+* **a)** Disclaiming warranty or limiting liability differently from the terms of
+sections 15 and 16 of this License; or
+* **b)** Requiring preservation of specified reasonable legal notices or author
+attributions in that material or in the Appropriate Legal Notices displayed by works
+containing it; or
+* **c)** Prohibiting misrepresentation of the origin of that material, or requiring that
+modified versions of such material be marked in reasonable ways as different from the
+original version; or
+* **d)** Limiting the use for publicity purposes of names of licensors or authors of the
+material; or
+* **e)** Declining to grant rights under trademark law for use of some trade names,
+trademarks, or service marks; or
+* **f)** Requiring indemnification of licensors and authors of that material by anyone
+who conveys the material (or modified versions of it) with contractual assumptions of
+liability to the recipient, for any liability that these contractual assumptions
+directly impose on those licensors and authors.
+
+All other non-permissive additional terms are considered “further
+restrictions” within the meaning of section 10. If the Program as you received
+it, or any part of it, contains a notice stating that it is governed by this License
+along with a term that is a further restriction, you may remove that term. If a
+license document contains a further restriction but permits relicensing or conveying
+under this License, you may add to a covered work material governed by the terms of
+that license document, provided that the further restriction does not survive such
+relicensing or conveying.
+
+If you add terms to a covered work in accord with this section, you must place, in
+the relevant source files, a statement of the additional terms that apply to those
+files, or a notice indicating where to find the applicable terms.
+
+Additional terms, permissive or non-permissive, may be stated in the form of a
+separately written license, or stated as exceptions; the above requirements apply
+either way.
+
+### 8. Termination.
+
+You may not propagate or modify a covered work except as expressly provided under
+this License. Any attempt otherwise to propagate or modify it is void, and will
+automatically terminate your rights under this License (including any patent licenses
+granted under the third paragraph of section 11).
+
+However, if you cease all violation of this License, then your license from a
+particular copyright holder is reinstated (a) provisionally, unless and until the
+copyright holder explicitly and finally terminates your license, and (b) permanently,
+if the copyright holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+Moreover, your license from a particular copyright holder is reinstated permanently
+if the copyright holder notifies you of the violation by some reasonable means, this
+is the first time you have received notice of violation of this License (for any
+work) from that copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+Termination of your rights under this section does not terminate the licenses of
+parties who have received copies or rights from you under this License. If your
+rights have been terminated and not permanently reinstated, you do not qualify to
+receive new licenses for the same material under section 10.
+
+### 9. Acceptance Not Required for Having Copies.
+
+You are not required to accept this License in order to receive or run a copy of the
+Program. Ancillary propagation of a covered work occurring solely as a consequence of
+using peer-to-peer transmission to receive a copy likewise does not require
+acceptance. However, nothing other than this License grants you permission to
+propagate or modify any covered work. These actions infringe copyright if you do not
+accept this License. Therefore, by modifying or propagating a covered work, you
+indicate your acceptance of this License to do so.
+
+### 10. Automatic Licensing of Downstream Recipients.
+
+Each time you convey a covered work, the recipient automatically receives a license
+from the original licensors, to run, modify and propagate that work, subject to this
+License. You are not responsible for enforcing compliance by third parties with this
+License.
+
+An “entity transaction” is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an organization, or
+merging organizations. If propagation of a covered work results from an entity
+transaction, each party to that transaction who receives a copy of the work also
+receives whatever licenses to the work the party's predecessor in interest had or
+could give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if the predecessor
+has it or can get it with reasonable efforts.
+
+You may not impose any further restrictions on the exercise of the rights granted or
+affirmed under this License. For example, you may not impose a license fee, royalty,
+or other charge for exercise of rights granted under this License, and you may not
+initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging
+that any patent claim is infringed by making, using, selling, offering for sale, or
+importing the Program or any portion of it.
+
+### 11. Patents.
+
+A “contributor” is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The work thus
+licensed is called the contributor's “contributor version”.
+
+A contributor's “essential patent claims” are all patent claims owned or
+controlled by the contributor, whether already acquired or hereafter acquired, that
+would be infringed by some manner, permitted by this License, of making, using, or
+selling its contributor version, but do not include claims that would be infringed
+only as a consequence of further modification of the contributor version. For
+purposes of this definition, “control” includes the right to grant patent
+sublicenses in a manner consistent with the requirements of this License.
+
+Each contributor grants you a non-exclusive, worldwide, royalty-free patent license
+under the contributor's essential patent claims, to make, use, sell, offer for sale,
+import and otherwise run, modify and propagate the contents of its contributor
+version.
+
+In the following three paragraphs, a “patent license” is any express
+agreement or commitment, however denominated, not to enforce a patent (such as an
+express permission to practice a patent or covenant not to sue for patent
+infringement). To “grant” such a patent license to a party means to make
+such an agreement or commitment not to enforce a patent against the party.
+
+If you convey a covered work, knowingly relying on a patent license, and the
+Corresponding Source of the work is not available for anyone to copy, free of charge
+and under the terms of this License, through a publicly available network server or
+other readily accessible means, then you must either (1) cause the Corresponding
+Source to be so available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner consistent with
+the requirements of this License, to extend the patent license to downstream
+recipients. “Knowingly relying” means you have actual knowledge that, but
+for the patent license, your conveying the covered work in a country, or your
+recipient's use of the covered work in a country, would infringe one or more
+identifiable patents in that country that you have reason to believe are valid.
+
+If, pursuant to or in connection with a single transaction or arrangement, you
+convey, or propagate by procuring conveyance of, a covered work, and grant a patent
+license to some of the parties receiving the covered work authorizing them to use,
+propagate, modify or convey a specific copy of the covered work, then the patent
+license you grant is automatically extended to all recipients of the covered work and
+works based on it.
+
+A patent license is “discriminatory” if it does not include within the
+scope of its coverage, prohibits the exercise of, or is conditioned on the
+non-exercise of one or more of the rights that are specifically granted under this
+License. You may not convey a covered work if you are a party to an arrangement with
+a third party that is in the business of distributing software, under which you make
+payment to the third party based on the extent of your activity of conveying the
+work, and under which the third party grants, to any of the parties who would receive
+the covered work from you, a discriminatory patent license (a) in connection with
+copies of the covered work conveyed by you (or copies made from those copies), or (b)
+primarily for and in connection with specific products or compilations that contain
+the covered work, unless you entered into that arrangement, or that patent license
+was granted, prior to 28 March 2007.
+
+Nothing in this License shall be construed as excluding or limiting any implied
+license or other defenses to infringement that may otherwise be available to you
+under applicable patent law.
+
+### 12. No Surrender of Others' Freedom.
+
+If conditions are imposed on you (whether by court order, agreement or otherwise)
+that contradict the conditions of this License, they do not excuse you from the
+conditions of this License. If you cannot convey a covered work so as to satisfy
+simultaneously your obligations under this License and any other pertinent
+obligations, then as a consequence you may not convey it at all. For example, if you
+agree to terms that obligate you to collect a royalty for further conveying from
+those to whom you convey the Program, the only way you could satisfy both those terms
+and this License would be to refrain entirely from conveying the Program.
+
+### 13. Use with the GNU Affero General Public License.
+
+Notwithstanding any other provision of this License, you have permission to link or
+combine any covered work with a work licensed under version 3 of the GNU Affero
+General Public License into a single combined work, and to convey the resulting work.
+The terms of this License will continue to apply to the part which is the covered
+work, but the special requirements of the GNU Affero General Public License, section
+13, concerning interaction through a network will apply to the combination as such.
+
+### 14. Revised Versions of this License.
+
+The Free Software Foundation may publish revised and/or new versions of the GNU
+General Public License from time to time. Such new versions will be similar in spirit
+to the present version, but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program specifies that
+a certain numbered version of the GNU General Public License “or any later
+version” applies to it, you have the option of following the terms and
+conditions either of that numbered version or of any later version published by the
+Free Software Foundation. If the Program does not specify a version number of the GNU
+General Public License, you may choose any version ever published by the Free
+Software Foundation.
+
+If the Program specifies that a proxy can decide which future versions of the GNU
+General Public License can be used, that proxy's public statement of acceptance of a
+version permanently authorizes you to choose that version for the Program.
+
+Later license versions may give you additional or different permissions. However, no
+additional obligations are imposed on any author or copyright holder as a result of
+your choosing to follow a later version.
+
+### 15. Disclaimer of Warranty.
+
+THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER
+EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE
+QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
+DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+### 16. Limitation of Liability.
+
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY
+COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS
+PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL,
+INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE
+OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE
+WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+### 17. Interpretation of Sections 15 and 16.
+
+If the disclaimer of warranty and limitation of liability provided above cannot be
+given local legal effect according to their terms, reviewing courts shall apply local
+law that most closely approximates an absolute waiver of all civil liability in
+connection with the Program, unless a warranty or assumption of liability accompanies
+a copy of the Program in return for a fee.
\ No newline at end of file
diff --git a/README.md b/README.md
index c8f3d64..69111be 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,82 @@
# raven-python
-The Python counterpart of the
-[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB), built on
-[cobrapy](https://github.com/opencobra/cobrapy).
-
-`raven-python` covers de-novo reconstruction (KEGG + protein homology),
-context-specific model extraction (`tINIT` / `ftINIT`), metabolic-task
-validation, gap-filling, omics ingestion, sub-cellular localisation, model
-manipulation, and YAML / SIF / Excel I/O — preserving the established RAVEN
-workflows in a Python-native form.
-
-This `main` branch is intentionally empty. Development happens on the
-`develop` branch via a series of feature branches; see the open and merged
-pull requests for the current state of the port.
+[](https://github.com/SysBioChalmers/raven-python/actions/workflows/ci.yml)
+
+**Reconstruction, Analysis and Visualisation of Metabolic Networks — in Python.**
+
+`raven-python` is the Python counterpart of the
+[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB). It builds on
+[**cobrapy**](https://github.com/opencobra/cobrapy) for everything cobrapy already does
+well (simulation, standard analyses, SBML I/O, model manipulation) and adds the
+functionality that's unique to RAVEN:
+
+* **De novo reconstruction** from KEGG and protein homology (BLAST / DIAMOND).
+* **Context-specific models** from omics data via **tINIT / ftINIT**, with task-aware
+ gap-filling and the linear-merge MILP reduction.
+* **Metabolic-task** validation (`check_tasks`, `fitTasks`).
+* **Connectivity gap-filling** against template models.
+* **Omics integration** — Human Protein Atlas (proteomics + RNA-seq) ingestion.
+* **Sub-cellular localisation** prediction by MILP, with partial-update mode and
+ pluggable predictors (WoLF PSORT, DeepLoc, …).
+* **N-model comparison**; **reporter metabolites**; **FSEOF**; **flux sampling**.
+* **YAML I/O** following the cobra standard, plus geckopy's `ec-*` enzyme-constrained
+ fields. **SIF** export. **RAVEN-style Excel** export.
+
+The status of every RAVEN function (ported, cheatsheet-mapped to cobra, or explicitly
+not ported) is documented function-by-function in
+**[docs/raven_migration.md](docs/raven_migration.md)**.
+
+## Design principle
+
+The canonical in-memory object is always a [`cobra.Model`](https://cobrapy.readthedocs.io).
+There is no parallel RAVEN struct, no `ravenCobraWrapper`-style adapter. RAVEN-specific
+fields that cobra doesn't model natively (`rxnMiriams`, `metDeltaG`,
+`rxnConfidenceScores`, …) live in cobra's `annotation` / `notes` dictionaries. This
+avoids duplicating cobra's data model and keeps raven-python interoperable with the wider
+COBRA ecosystem.
+
+## Status
+
+raven-python has been validated against MATLAB RAVEN on **Human-GEM** (5 Hart2015 cell-line
+models, Jaccard 0.975–0.980 — see [docs/humangem_validation.md](docs/humangem_validation.md)).
+The functional scope of the original RAVEN toolbox is covered with two principled
+omissions:
+
+* **MetaCyc-based reconstruction** is not implemented and is flagged for removal from
+ MATLAB RAVEN as well — see [IMPROVEMENTS.md](IMPROVEMENTS.md) under `R-MetaCyc`.
+* **Dynamic FBA** is not implemented — several maintained Python packages already cover
+ it ([`dfba`](https://pypi.org/project/dfba/), [`reframed`](https://pypi.org/project/reframed/),
+ [`mewpy`](https://pypi.org/project/mewpy/)).
+
+What's still open is catalogued in **[docs/todo.md](docs/todo.md)** (visualisation / Phase
+6 is the main item).
+
+## Installation (development)
+
+```bash
+git clone https://github.com/SysBioChalmers/raven-python
+cd raven-python
+pip install -e ".[dev]"
+```
+
+raven-python requires Python ≥ 3.11. Genome-scale (f)tINIT MILPs currently require **Gurobi**
+([details on solver portability](docs/init_solver_benchmark.md)); toy and unit-test work
+runs on the open-source GLPK.
+
+## Documentation
+
+See **[docs/README.md](docs/README.md)** for the documentation index.
+
+## Relationship to MATLAB RAVEN
+
+`raven-python` is a derivative work and is released under the same **GPL-3.0-or-later**
+license. If you use it in scientific work, please cite the RAVEN 2 paper:
+
+> Wang H, Marcišauskas S, Sánchez BJ, Domenzain I, Hermansson D, Agren R, Nielsen J,
+> Kerkhoven EJ. (2018) RAVEN 2.0: A versatile toolbox for metabolic network
+> reconstruction and a case study on *Streptomyces coelicolor*. PLoS Comput Biol 14(10):
+> e1006541.
+
+## License
+
+[GPL-3.0-or-later](LICENSE)
diff --git a/docs/humangem_validation.md b/docs/humangem_validation.md
new file mode 100644
index 0000000..bbaae6b
--- /dev/null
+++ b/docs/humangem_validation.md
@@ -0,0 +1,117 @@
+# Human-GEM cell-type model validation: raven-python vs RAVEN
+
+Validation of raven-python's tINIT/ftINIT against MATLAB RAVEN on a real genome-scale
+reconstruction (Human-GEM) using the Hart2015 RNA-seq dataset (5 cell lines: DLD1,
+GBM, HCT116, HELA, RPE1). The goal is functional equivalence — do raven-python and RAVEN
+extract the *same* context-specific reaction sets from the same inputs?
+
+## Method
+
+* **Template & inputs.** RAVEN built the ftINIT reference model from Human-GEM
+ (`prepHumanModelForftINIT`: remove drug/exchange/artificial reactions, set
+ spontaneous/custom lists) and exported it as `raven_refModel.xml` (10198 reactions).
+ raven-python builds on that *same* exported model, so the candidate reaction universe is
+ identical and set comparison is exact.
+* **Scoring.** Gene scores from `log2(TPM+1)`-style expression via
+ `gene_scores_from_expression`, mapped to reactions through the GPR
+ (`score_reactions_from_genes`), matching RAVEN's `getExprForRxnScore`.
+* **ftINIT.** Series `1+1` (2 staged MILP steps). RAVEN run via `ftINIT.m` with Gurobi;
+ raven-python via `raven_python.init.ftinit` with Gurobi (`mip_gap=0.001`, `time_limit=600`).
+* **tINIT.** raven-python `get_init_model` (classic single-MILP INIT) on HCT116, compared to
+ the ftINIT result for the same cell line.
+* **Tasks.** Two raven-python ftINIT variants: *no-task* (expression only) and
+ *task-constrained* (essential metabolic tasks, `metabolicTasks_Essential.txt`, force
+ task-essential reactions to be kept). RAVEN's reference is task-constrained.
+* **Solver.** Gurobi 13.0.1 for both tools.
+
+## Engineering findings (raven-python tractability)
+
+Getting ftINIT to run at genome scale surfaced three issues, all now fixed and matching
+RAVEN's design:
+
+1. **O(n²) constraint construction.** Building the steady-state balances with Python
+ `sum()` re-canonicalises a growing sympy expression at each term; hub metabolites
+ (ATP/H⁺/H₂O in ~10³ reactions) made one constraint take ~minutes (≈154 s total build,
+ benchmark: 1500-term `sum` = 59 s vs `optlang.symbolics.add` = 0.01 s). Fixed by
+ building flat term lists once per reaction and summing with `optlang.symbolics.add`
+ (in both ftINIT and tINIT).
+2. **Big-M too loose.** The on/off indicator constraints used each reaction's own bound
+ (±1000) as big-M; with `force_on=0.1` that is a ~10⁴ ratio → very weak LP relaxation
+ → Gurobi never closes the gap. RAVEN uses a fixed big-M = 100. Adopted.
+3. **Stoichiometric rescaling.** A fixed big-M=100 is only valid if no reaction needs
+ flux ≫100; ported RAVEN's `rescaleModelForINIT` (cap each reaction's coefficient
+ dynamic range at 25×, normalise mean |coeff| to 1) into `prep_init_model`. Without it
+ the staged MILP is infeasible (step-1 caps transports that step-0 used freely).
+
+Net effect: a full ftINIT cell-line solve went from *not finishing* to ~200 s,
+comparable to RAVEN.
+
+## Results
+
+### Reaction counts
+
+| cell line | RAVEN ftINIT | raven-python ftINIT (no-task) | raven-python ftINIT (task) |
+|-----------|-------------:|--------------------------:|-----------------------:|
+| DLD1 | 7782 | 7744 | 7774 |
+| GBM | 7668 | 7667 | 7680 |
+| HCT116 | 7780 | 7752 | 7776 |
+| HELA | 7832 | 7789 | 7816 |
+| RPE1 | 7569 | 7564 | 7570 |
+
+Counts agree within ~0.5 % everywhere; the task-constrained run is closest (e.g. RPE1
+7570 vs 7569, HCT116 7776 vs 7780). raven-python tINIT (HCT116) gives 6024 reactions — a
+smaller model, as expected from the different (classic INIT) objective.
+
+### Agreement — raven-python (no-task) ftINIT vs RAVEN ftINIT
+
+| cell line | shared | only raven-python | only RAVEN | Jaccard |
+|-----------|-------:|--------------:|-----------:|--------:|
+| DLD1 | 7667 | 77 | 115 | 0.976 |
+| GBM | 7562 | 105 | 106 | 0.973 |
+| HCT116 | 7675 | 77 | 105 | 0.977 |
+| HELA | 7707 | 82 | 125 | 0.974 |
+| RPE1 | 7470 | 94 | 99 | 0.975 |
+
+**~97.5 % of reactions are identical** between the two independent implementations, even
+though this run is *expression-only* while RAVEN's reference is task-constrained. The
+"only RAVEN" surplus (≈99–125) is expected to include task-essential reactions that the
+task-constrained run (below) recovers.
+
+### Agreement — raven-python (task-constrained) ftINIT vs RAVEN ftINIT
+
+| cell line | shared | only raven-python | only RAVEN | Jaccard |
+|-----------|-------:|--------------:|-----------:|--------:|
+| DLD1 | 7699 | 75 | 83 | 0.980 |
+| GBM | 7588 | 92 | 80 | 0.978 |
+| HCT116 | 7696 | 80 | 84 | 0.979 |
+| HELA | 7735 | 81 | 97 | 0.978 |
+| RPE1 | 7493 | 77 | 76 | 0.980 |
+
+Adding the essential metabolic tasks (same task list RAVEN uses) raises agreement to
+**Jaccard 0.978–0.980** and makes the disagreement symmetric (only-raven-python ≈ only-RAVEN
+≈ 80), confirming the prediction: the task constraints recover RAVEN's task-essential
+reactions. The residual ≈80 reactions each way out of ~7700 is at the level expected from
+MIP-gap tolerance (both accept near-optimal incumbents) and alternate optima.
+
+### raven-python tINIT vs ftINIT (HCT116)
+
+tINIT 6024 rxns vs ftINIT 7752; shared 5957, Jaccard 0.762. tINIT is nearly a subset
+(only 67 reactions unique to it) — the two methods agree on a common core, with ftINIT
+keeping more (its staged formulation and task handling are less aggressive at removal).
+This matches the expected tINIT/ftINIT relationship rather than indicating a defect.
+
+## Conclusions
+
+From identical inputs on a genome-scale human reconstruction, raven-python reproduces RAVEN's
+ftINIT reaction selection to **97.5 % (no-task) and 98 % (task-constrained) set identity**
+across five cell lines — strong evidence of functional equivalence between the two
+independent implementations. Agreement is symmetric and the residual (~80 reactions each
+way) is consistent with MIP near-optimality and alternate optima rather than any
+systematic divergence.
+
+Reaching genome-scale tractability required matching RAVEN's numerical-conditioning
+choices and fixing optlang-specific construction costs (see *Engineering findings*):
+fixed big-M = 100, `rescaleModelForINIT`, `optlang.symbolics.add` instead of Python
+`sum()` in every MILP build (ftINIT, tINIT, and the gap-fill). With these, a
+task-constrained cell-line model builds in ~15–25 min (dominated by the
+essential-forced staged MILP) and a no-task one in ~3 min, comparable to RAVEN.
diff --git a/docs/init_param_calibration.md b/docs/init_param_calibration.md
new file mode 100644
index 0000000..cc69314
--- /dev/null
+++ b/docs/init_param_calibration.md
@@ -0,0 +1,342 @@
+# (f)tINIT parameter calibration & input-robustness
+
+Empirical study of raven-python's (f)tINIT parameters on a genome-scale model (Human-GEM,
+Hart2015 / HCT116). Two questions:
+
+1. **Calibration** — on clean data, which parameter values give the best speed/quality
+ trade-off? (`scripts/analyze_init_params.py`)
+2. **Robustness** — with the task layer always on (it is part of the pipeline, not a
+ variable), how does degrading the *transcriptomics input* affect the model, and which
+ parameters keep it functional and stable? (`scripts/analyze_init_robustness.py`)
+
+Both scripts are resumable and reusable on any model/dataset; the numbers below are HCT116.
+"Jaccard" is reaction-set overlap with the reference (tightest setting / clean data) — for
+a model-extraction tool the reaction set is the product, and a MIP gap bounds only the
+*objective*, so set-stability is tracked separately.
+
+---
+
+## 1. Clean-data calibration
+
+### ftINIT MILP — `mip_gap` (single step-0 solve, big_m=100, force_on=0.1)
+
+| mip_gap | time (s) | objective | rel.obj.gap | Jaccard vs tightest |
+|--------:|---------:|----------:|------------:|--------------------:|
+| 0.0002 | 48 | 49357 | ref | ref |
+| 0.001 | 44 | 49357 | +0.0000 | **1.0000** |
+| 0.003 | 42 | 49289 | +0.0014 | 0.9973 |
+| 0.01 | 42 | 49185 | +0.0035 | 0.9935 |
+| 0.03 | 52 | 49185 | +0.0035 | 0.9935 |
+| 0.1 | 46 | 45615 | +0.0758 | 0.9469 |
+
+**Solve time is essentially flat across the gap** (the model build dominates), so a tight
+gap is nearly free. `mip_gap=0.001` reproduces the proven optimum exactly (Jaccard 1.0);
+quality only collapses at 0.1. → **Default 0.001.** (The genome-scale staged pipeline still
+needs *some* gap + a `time_limit` because the full essential-forced MILP can be much harder
+than this single step — see robustness timings.)
+
+### ftINIT MILP — `big_m` (gap=0.001, force_on=0.1)
+
+| big_m | time (s) | rel.obj.gap | Jaccard vs big_m=100 |
+|------:|---------:|------------:|---------------------:|
+| 100 | 51 | ref | ref |
+| 50 | 54 | +0.0006 | 0.983 |
+| 25 | 53 | +0.0007 | 0.982 |
+| 250 | 55 | +0.0005 | 0.984 |
+| 1000 | 59 | +0.0001 | 0.986 |
+
+At step-0 (on the *scaled* model) `big_m` barely affects objective or time, but shifts which
+reactions are kept by ~2% (alternate optima). `big_m=100` is RAVEN's value and is required
+for the *staged* pipeline to stay feasible (a fixed 100 is only valid with stoichiometric
+rescaling — see §1.4). → **Default 100.**
+
+### ftINIT MILP — `force_on` (gap=0.001, big_m=100)
+
+| force_on | time (s) | rel.obj.gap | Jaccard vs 0.1 |
+|---------:|---------:|------------:|---------------:|
+| 0.1 | 63 | ref | ref |
+| 0.02 | 69 | +0.0005 | 0.983 |
+| 0.05 | 56 | +0.0000 | 0.990 |
+| 0.2 | 59 | +0.0004 | 0.982 |
+| 0.5 | 79 | +0.0005 | 0.985 |
+
+`force_on` (minimum flux for a reaction to count as "on") changes the *model*, not just a
+tolerance, but the reaction set is fairly insensitive (Jaccard ≥0.98) and the objective
+hardly moves. → **Default 0.1** (RAVEN), no strong reason to change.
+
+### prep scaling — `rescaleModelForINIT` `max_stoich_diff` and on/off (gap=0.001, big_m=100)
+
+| config | time (s) | rel.obj.gap | Jaccard vs scaled msd=25 |
+|--------|---------:|------------:|-------------------------:|
+| scale on, msd=25 | 51 | ref | ref |
+| msd=10 | 49 | +0.0075 | 0.989 |
+| msd=50 | 61 | +0.0003 | 0.982 |
+| msd=100 | 62 | −0.0001 | 0.986 |
+| scale off | 45 | +0.0129 | 0.973 |
+
+At step-0 even `scale=off` is feasible, but it drifts most (Jaccard 0.973, objective +1.3%);
+`max_stoich_diff` 10–100 are all within ~1%. **This understates scaling's importance** — at
+step-0 there is no big-M cap on the held-out transports. In the *full staged pipeline*,
+`scale=off` with `big_m=100` is **infeasible** (step-1 caps transports that step-0 used
+freely). → **Keep scaling on, msd=25** (RAVEN's default).
+
+**Calibration summary (defaults are well-chosen):** `mip_gap=0.001`, `big_m=100`,
+`force_on=0.1`, scaling on (`max_stoich_diff=25`). For the genome-scale staged pipeline also
+set a `time_limit` (≈120–600 s/step) so a hard essential-forced step returns a near-optimal
+incumbent rather than grinding.
+
+### tINIT MILP (`get_init_model`, `essential_rxns=[]`, `time_limit=400s`)
+
+**mip_gap** (eps=1, prod_weight=0.5):
+
+| mip_gap | time (s) | n_kept | Jaccard vs gap=0.001 |
+|--------:|---------:|-------:|---------------------:|
+| 0.001 | 901 | 6024 | ref |
+| 0.003 | 869 | 6036 | 0.991 |
+| 0.01 | 595 | 5967 | 0.968 |
+
+Tightening the gap costs ~50% more wall time on this MILP (unlike ftINIT step-0, build
+doesn't dominate); a 1% gap is ~30% faster with ~3% reaction-set drift.
+→ **`mip_gap=0.001`** for stability, **0.01** for a faster looser solve.
+
+**eps** (gap=0.005, the connectivity-flux threshold — *changes the model*):
+
+| eps | n_kept | Jaccard vs eps=1.0 |
+|----:|-------:|-------------------:|
+| 0.1 | 6119 | 0.952 |
+| 0.5 | 6123 | 0.952 |
+| 1.0 | 6064 | ref |
+| 2.0 | 6090 | 0.960 |
+
+Each `eps` value gives a slightly different model (Jaccard ~0.95 across the range); the
+reaction-set spread is ~5%. `eps=1.0` is RAVEN's default; smaller values produce *slightly*
+larger models (loosen the connectivity bar). Pick by what the data justifies — see the
+caveat at the top of `init.py`.
+
+**prod_weight** (gap=0.005, the metabolite-production reward — *changes the model*):
+
+| prod_weight | n_kept | Jaccard vs 0.5 |
+|------------:|-------:|---------------:|
+| 0.0 | 5973 | 0.961 |
+| 0.25 | 6015 | 0.974 |
+| 0.5 | 6064 | ref |
+| 1.0 | 6106 | 0.955 |
+
+A higher `prod_weight` keeps slightly more reactions (rewards more connectivity); `0.5`
+(RAVEN's default) is the middle of the range. Effect is modest (~5%).
+
+**big_m** (gap=0.005, `None` = per-reaction `ub`):
+
+| big_m | n_kept | Jaccard vs None |
+|------:|-------:|----------------:|
+| None (per-rxn ub) | 6064 | ref |
+| 1000 | 6064 | **1.000** |
+| 250 | 6114 | 0.953 |
+| 100 | 6023 | 0.930 |
+
+`big_m=1000` is identical to `big_m=None` here because the model's `ub` is ±1000 already
+(so the per-reaction cap *is* 1000). Smaller fixed caps (250, 100) shift alternate optima
+by 5–7% but do not change the objective. Unlike ftINIT, tINIT has *not* been run through
+`rescaleModelForINIT`, so dropping `big_m` below 1000 may invalidate the LP feasibility
+region for reactions whose own bound is larger — keep the default (per-reaction `ub`).
+
+**tINIT calibration summary:** `mip_gap=0.001` (or 0.01 for ~30% speedup at ~3% drift);
+`eps`, `prod_weight`, `big_m` defaults are fine — they all change the *model*, not just
+tolerance, so tune by what the data and biology call for, not by these tables.
+
+### ftINIT full pipeline (`ftinit`, series='1+1', no-task scaled prep, `time_limit=600s`)
+
+| config | time (s) | n_kept | Jaccard vs gap=0.001 |
+|--------|---------:|-------:|---------------------:|
+| **mip_gap=0.001** (default big_m=100) | 346 | 7752 | ref |
+| mip_gap=0.003 | 288 | 7748 | 0.993 |
+| mip_gap=0.01 | 218 | 7746 | **0.995** |
+| big_m=50 (gap=0.003) | 738 | 7799 | 0.974 |
+| big_m=250 (gap=0.003) | 345 | 7766 | 0.977 |
+
+Unlike the single-step ftINIT MILP in §1.1 (where build time dominated and the gap was
+free), **the full pipeline does benefit from a looser gap**: `mip_gap=0.01` is ~37 %
+faster than `0.001` with Jaccard 0.995 — essentially the same model. → **For genome-scale
+ftINIT, `mip_gap=0.01` (or 0.005) is the sweet spot**; keep 0.001 only if exact
+reproducibility matters more than a few minutes.
+
+`big_m=50` is actually *slower* than the default 100 (738s vs 346s) — a tighter cap makes
+the LP relaxation harder for borderline reactions; `big_m=250` is the same speed as 100
+but shifts the reaction set ~2 %. → **Keep `big_m=100`** (RAVEN's value, what scaling is
+designed for).
+
+### tINIT + many task-essential reactions: a structural limitation
+
+ftINIT's task layer (gap-fill) and tINIT's task layer (forcing `essential_rxns`) are
+*not equivalent*. tINIT forces every essential reaction to carry `flux ≥ eps`. With
+Human-GEM's 113 task-essential reactions (the validation set), the resulting steady-state
+system is infeasible regardless of `eps`:
+
+| essentials passed to `run_init` | result |
+|---|---|
+| 0 (the original validation call) | ✅ ok, 6024 reactions |
+| 113 (merged-survivor IDs from `prep.essential_rxns`) | ❌ `infeasible` (proven by Gurobi presolve, ~330s) |
+| 260 (pre-merge IDs from `find_task_essential_reactions` cache) | ❌ `infeasible` (~480s) |
+
+Lowering `eps` (1.0 → 0.1) does **not** fix it; the issue is that 100+ reactions cannot
+simultaneously each carry a fixed positive flux in their forced direction at steady state.
+ftINIT avoids this by using an *adaptive* per-reaction forcing magnitude
+(`min(0.99·|previous flux|, force_on)`) so each essential is forced at a value it
+*actually carried* in a prior feasible solution. tINIT's one-size-fits-all `eps`
+mechanism doesn't have that escape hatch.
+
+**Practical takeaway.** For functional context-specific models on genome-scale data, use
+ftINIT — the task layer (gap-fill, adaptive essential forcing) is what makes the pipeline
+robust. tINIT remains useful for the small/no-essentials case (e.g. the
+expression-only baseline in the validation), but pairing it with the full task-essential
+set is a known incompatibility; the tINIT robustness study below is therefore reported
+with `essential_rxns=[]`.
+
+---
+
+## 2. Robustness to degraded transcriptomics (task layer always on)
+
+The metabolic-task + gap-fill layer is held fixed; only the expression input is degraded.
+`frac` = fraction of the 69 essential tasks the extracted model performs (`check_tasks`);
+`Jaccard` = reaction-set overlap with the clean-data model.
+
+| input | n_rxns | tasks pass | frac | Jaccard vs clean |
+|-------|-------:|-----------:|-----:|-----------------:|
+| **clean** | 7777 | 69/69 | 1.000 | ref |
+| dropout 50% | 5968 | 67/69 | 0.971 | **0.713** |
+| dropout 70% | 5113 | 68/69 | 0.986 | **0.594** |
+| noise σ=1.0 | 7812 | 69/69 | 1.000 | 0.952 |
+| noise σ=2.0 | 7768 | 69/69 | 1.000 | 0.919 |
+| downsample 50% | 6765 | 68/69 | 0.986 | 0.815 |
+| downsample 70% | 6123 | 68/69 | 0.986 | 0.728 |
+
+(dropout = genes set to 0 → score −5; noise = ×`exp(N(0,σ))`; downsample = genes dropped →
+`no_gene_score`.)
+
+**Findings:**
+
+* **Robust to noise, sensitive to sparsity.** Multiplicative expression noise barely changes
+ the model (Jaccard 0.92–0.95, size stable, all tasks pass). Sparsity is far more damaging:
+ 50% dropout already drops the reaction set to **0.71 Jaccard** (and shrinks 7777→5968), 70%
+ to **0.59**.
+* **Sparsity shrinks the model toward the task-essential core.** Missing/zeroed genes remove
+ the expression evidence for a reaction; the task layer only adds back what tasks require, so
+ sparse input yields smaller, more "generic" models. Dropout (−5) is harsher than
+ downsampling (−2).
+* **Functionality is largely but not perfectly preserved.** With the task layer, `frac` stays
+ ≥0.97, but dips to 67–68/69 under heavy sparsity — i.e. the bounded gap-fill plus the
+ post-hoc low-score-gene pruning occasionally leave 1–2 essential tasks unsatisfied. (See the
+ lever sweep below for whether `no_gene_score`/`force_on` recover them.)
+* **Cost tracks damage.** Dropout runs are slowest (more broken tasks → more gap-fill);
+ noise is cheap.
+
+> **Tractability note (a parameter that prevents failure):** the gap-fill MILP must be bounded
+> (`mip_gap`/`time_limit`). Unbounded, severe degradation (which breaks many tasks at once)
+> makes it solve a hard min-cost MILP per broken task to proven optimality — observed to run
+> >75 min for one 90%-dropout model. With the bound it returns a near-optimal fill quickly.
+
+### Levers at dropout 70% — which parameter best stabilises the model?
+
+| config | n_rxns | frac | Jaccard vs clean |
+|--------|-------:|-----:|-----------------:|
+| default (no_gene_score=−2, force_on=0.1) | 5113 | 0.986 | 0.594 |
+| no_gene_score=−1.0 | 5110 | 0.986 | 0.593 |
+| no_gene_score=−0.5 | 5128 | 0.986 | 0.593 |
+| force_on=0.2 | 5159 | 0.986 | 0.600 |
+
+**No lever recovers the drift** — Jaccard stays ~0.59 across all settings. Two reasons,
+both informative:
+
+* The information dropout destroys is simply gone; no scoring/connectivity knob reconstructs
+ the missing expression evidence. You cannot tune your way out of sparse input.
+* `no_gene_score` is the wrong knob *for dropout specifically*: dropout leaves genes
+ *present but zero* (scored −5), whereas `no_gene_score` only governs reactions whose genes
+ are **absent** from the data — i.e. the *downsampling* failure mode. So `no_gene_score` is
+ a meaningful lever for missing-data sparsity (a less-negative value keeps more
+ unmeasured reactions, growing the model back toward clean), but it has nothing to act on
+ under dropout.
+
+**Practical takeaway.** The robustness levers that matter are *structural*, not numeric: the
+task + gap-fill layer (keeps the model functional regardless of input quality) and a bounded
+gap-fill MILP (keeps it tractable). For *missing*-gene sparsity specifically, `no_gene_score`
+trades model size against confidence. For noise, defaults are already robust. No parameter
+restores fidelity lost to dropout — that is a property of the data, not the pipeline.
+
+### tINIT robustness — `essential_rxns=[]` (the tINIT-without-task-layer picture)
+
+For the reasons in §1.5, tINIT cannot accept the full task-essential set as forced
+reactions; this section runs `get_init_model` with `essential_rxns=[]` to show the
+realistic tINIT behaviour on the same degradation gradient — i.e. the *cost of not
+having ftINIT's gap-fill safety net*.
+
+| input | n_rxns | tasks pass | frac | Jaccard vs clean |
+|-------|-------:|-----------:|-----:|-----------------:|
+| **clean** | 6277 | **35/69** | **0.507** | ref |
+| dropout 50% | 4910 | 23/69 | 0.333 | 0.673 |
+| dropout 70% | 2807 | 21/69 | 0.304 | 0.408 |
+| noise σ=1.0 | 6661 | 25/69 | 0.362 | 0.878 |
+| noise σ=2.0 | 6146 | 24/69 | 0.348 | 0.869 |
+| downsample 50% | 5006 | 24/69 | 0.348 | 0.722 |
+| downsample 70% | 3541 | 19/69 | 0.275 | 0.515 |
+
+**The headline contrast with ftINIT:**
+
+| | ftINIT (task layer) | tINIT (no task layer) |
+|---|---|---|
+| clean | 7777 rxns, **69/69 (1.000)** | 6277 rxns, **35/69 (0.507)** |
+| dropout 0.7 | 5113 rxns, **68/69 (0.986)**, J 0.594 | 2807 rxns, **21/69 (0.304)**, J 0.408 |
+| noise σ=2.0 | 7768 rxns, **69/69 (1.000)**, J 0.919 | 6146 rxns, **24/69 (0.348)**, J 0.869 |
+| downsample 0.7 | 6123 rxns, **68/69 (0.986)**, J 0.728 | 3541 rxns, **19/69 (0.275)**, J 0.515 |
+
+* tINIT-without-gap-fill fails roughly **half the essential tasks even on clean data**;
+ ftINIT-with-gap-fill passes them all. Under degradation tINIT collapses further (down
+ to 19/69 at 70 % downsample), ftINIT stays ≥67/69 throughout.
+* **Reaction-set drift is comparable** under noise (Jaccard 0.87 vs 0.92) but worse for
+ tINIT under sparsity (0.41 vs 0.59 at 70 % dropout) because there's no gap-fill to
+ re-add structurally needed reactions.
+
+This is *not* a critique of the tINIT algorithm — classic INIT was designed for the
+no-task-layer case. It is the empirical evidence for why ftINIT's design choices (task
++ gap-fill, adaptive essential forcing) are the right ones for genome-scale tissue
+model extraction, and why tINIT is mostly useful here as a baseline.
+
+#### tINIT levers at dropout 70%
+
+| config | n_rxns | tasks pass | frac | Jaccard vs clean |
+|--------|-------:|-----------:|-----:|-----------------:|
+| default (prod_weight=0.5, eps=0.1) | 2807 | 21/69 | 0.304 | 0.408 |
+| prod_weight=0.0 | 2791 | 21/69 | 0.304 | 0.416 |
+| prod_weight=1.0 | 3386 | 22/69 | 0.319 | 0.485 |
+| prod_weight=2.0 | 3888 | 21/69 | 0.304 | 0.458 |
+| eps=0.5 | 2620 | 21/69 | 0.304 | 0.391 |
+| eps=1.0 | 3311 | 22/69 | 0.319 | 0.460 |
+
+Same conclusion as the ftINIT levers: parameter tuning can nudge (`prod_weight≥1.0`
+or a larger `eps` modestly grows the model and lifts Jaccard from 0.41 to ~0.48), but
+**no tINIT parameter recovers anything close to ftINIT's functionality** (22/69 at best
+vs ftINIT's 67–69/69 at the same dropout). The gap-fill layer, not the parameter
+choice, is what bridges the gap.
+
+---
+
+## 3. Cross-solver portability
+
+See [init_solver_benchmark.md](init_solver_benchmark.md) for the genome-scale
+solver comparison (Gurobi/HiGHS/GLPK) and [tests/test_init_solvers.py](../tests/test_init_solvers.py)
+for CI parameterised over installed MILP backends. Headline: at genome scale only Gurobi
+is viable today; HiGHS fails on an upstream optlang `hybrid_interface.clone()` bug; GLPK
+ignores `configuration.timeout` on MIP and ran 1 h+ without converging. Toy-scale
+correctness is portable (Gurobi + GLPK give identical verdicts on the unit-test
+networks), so local development works without a Gurobi licence.
+
+---
+
+## Reproducing
+
+```bash
+python scripts/analyze_init_params.py --cell HCT116 --sweeps ftinit_milp,prep_scale,tinit,ftinit_full
+python scripts/analyze_init_robustness.py --cell HCT116 --algo ftinit # then --algo tinit
+```
+
+Both reuse the cached Human-GEM preps from the validation run
+([docs/humangem_validation.md](humangem_validation.md)) and are resumable.
diff --git a/docs/init_solver_benchmark.md b/docs/init_solver_benchmark.md
new file mode 100644
index 0000000..1cd97ac
--- /dev/null
+++ b/docs/init_solver_benchmark.md
@@ -0,0 +1,67 @@
+# Cross-solver ftINIT benchmark — Human-GEM / HCT116
+
+Same `ftinit()` call (no-task scaled prep; `mip_gap=0.001`, `time_limit=900s`) run with each
+installed MILP-capable optlang interface. Generated by `scripts/analyze_init_solvers.py`;
+companion to the CI-scale `tests/test_init_solvers.py`.
+
+## Per-solver result
+
+| solver | time (s) | status | n_rxns |
+|--------|---------:|--------|-------:|
+| **gurobi** | 518 | ✅ ok | 7752 |
+| **hybrid** (HiGHS) | 55 | ❌ FAIL: `ValueError: LP Method primal is not valid (choose one of: auto, simplex, interior point)` | 0 |
+| **glpk** | 3672 | ❌ FAIL: did not converge in 1 h+ (`configuration.timeout` not honored by GLPK MIP) | 0 |
+
+> Wall clocks on Gurobi 13.0.1, optlang 1.x, cobra; one Human-GEM HCT116 cell line.
+
+## Findings
+
+* **Gurobi** is the only MILP backend that actually completes ftINIT on Human-GEM here:
+ ~9 min for 7752 reactions (matches the [validation](humangem_validation.md) result).
+ All our tractability tuning (big-M=100, `rescaleModelForINIT`, `mip_gap`,
+ `time_limit`) was done on Gurobi and it pays off.
+* **HiGHS** (`hybrid_interface`) **does not work with cobra at all in this stack** — not
+ raven-python's bug. Cobra sets `model.solver = "hybrid"` which calls
+ `optlang.interface.Model.clone()`, which re-applies a stored `lp_method="primal"`
+ parameter that the `hybrid_interface.Configuration` rejects (it accepts only
+ `auto/simplex/interior point`). This breaks `model.copy()` and any flow that swaps
+ the solver — i.e. the whole pipeline. The same failure mode shows up at toy scale in
+ `tests/test_init_solvers.py` (5/5 fail), so CI catches it now. Upstream optlang/cobra
+ patch needed; nothing to fix in raven-python.
+* **GLPK** loads the model but its MIP solver does **not honor
+ `configuration.timeout`** for this problem — we set the 900 s wall limit, GLPK still
+ ran 1 h+ at 100 % CPU without producing a solution and had to be killed. GLPK has no
+ licensing burden but is not a viable MILP backend at genome scale for ftINIT in
+ practice.
+
+## Practical implications
+
+* **Production / genome-scale ftINIT requires Gurobi** today. We should be explicit
+ about this in the package docs (license-encumbered dependency) until either the
+ optlang `hybrid_interface` clone bug is fixed or GLPK gains usable MIP time-limit
+ support.
+* **Toy / unit-test correctness is portable.** `tests/test_init_solvers.py` shows Gurobi
+ and GLPK give identical verdicts on the toy ftINIT/tINIT networks; the formulation
+ itself is solver-independent. Local development and CI work without a Gurobi license;
+ only the genome-scale runs need it.
+* **Future portability work** is two concrete upstream fixes:
+ 1. optlang `hybrid_interface.Configuration` should accept (or remap) the `lp_method`
+ parameter values that the generic clone path emits, or the clone path should drop
+ unknown LP-method values gracefully.
+ 2. GLPK's MIP solve should honor `configuration.timeout`. If upstream won't,
+ raven-python could implement a watchdog (separate thread sending `SIGINT` after the
+ wall limit) specifically when the solver is GLPK.
+
+## Reproducing
+
+```bash
+# CI parameterised tests (seconds, runs always):
+python -m pytest tests/test_init_solvers.py -v
+
+# Genome-scale benchmark (minutes-to-hours, manual):
+python scripts/analyze_init_solvers.py --cell HCT116 \
+ --doc docs/init_solver_benchmark.md
+```
+
+Both reuse the cached Human-GEM no-task prep from the validation run
+([humangem_validation.md](humangem_validation.md)) and are resumable per solver.
diff --git a/docs/kegg_data_format.md b/docs/kegg_data_format.md
new file mode 100644
index 0000000..efb6d13
--- /dev/null
+++ b/docs/kegg_data_format.md
@@ -0,0 +1,72 @@
+# KEGG relational-table storage format
+
+This note records *why* raven-python stores its KEGG-derived relational tables as
+**gzipped TSV**, and what other options we deliberately deferred. It applies to
+the maintainer-built KEGG artefacts described in PLAN.md §2.3b — the `ko_reaction`,
+`organism_gene_ko`, KO-name, and reaction-flag tables.
+
+The reference GEM itself is stored as **gzipped RAVEN/cobra YAML**
+(`reference_model.yml.gz`) — RAVEN-native and MATLAB-readable, gzipped to match the
+tables (the YAML I/O transparently gzips on a `.gz` suffix). On the real KEGG dump
+this is ~1.1 MB (vs ~30 MB as SBML) for the full 12k-reaction gene-free model.
+
+End users do not build any of this: the published artefacts are fetched and cached
+under `~/.cache/raven-python/data/kegg-/` by `ensure_data` (see
+`raven_python.data`), mirroring how binaries are provisioned.
+
+## Decision (current)
+
+- **Small tables** (`ko_reaction`, `ko_names`, `rxn_flags`): **gzipped TSV
+ (`.tsv.gz`)**. Each is well under 1 MB, so compression choice is irrelevant;
+ gzip keeps them MATLAB-native and dependency-free.
+- **The large `organism_gene_ko` table**: **xz-compressed TSV
+ (`organism_gene_ko.tsv.xz`), with rows sorted by `(organism, gene)`**.
+
+Why the large table differs. It carries KEGG's ~9M gene↔KO associations and
+dominates the artefact set (≈78 MB as unsorted gzipped TSV). Two cheap,
+stdlib-only changes cut that to ≈27 MB (2.9×):
+
+1. **Sort by `(organism, gene)`** before writing. Gene IDs from one organism
+ share long common prefixes (locus tags, numeric runs); sorting makes them
+ adjacent so the compressor can fold them. This alone takes 78 → 48 MB and
+ happens to match the by-organism query pattern in
+ `get_kegg_model_for_organism`. The sort is an external merge sort bounded to
+ `chunk_rows` in memory (see `stream_organism_gene_ko`), so it stays scalable.
+2. **xz instead of gzip** (Python stdlib `lzma`). Its larger dictionary captures
+ cross-row redundancy gzip's 32 KB window misses: sorted + xz reaches ≈27 MB.
+
+- **pandas reads/writes both with zero extra dependencies** — compression is
+ inferred from the `.gz`/`.xz` suffix; `lzma` and `gzip` are both stdlib, so
+ this works natively on Windows, macOS, and Linux with no external binary.
+- **MATLAB caveat:** `readtable` reads gzipped TSV after a `gunzip`, but MATLAB
+ has no built-in xz decompressor. The small tables stay MATLAB-native; the
+ large table needs an external `unxz` (or Java/`7-Zip`) before `readtable` on
+ the MATLAB side. The xz file is raven-python's (Python) primary artefact; this
+ trades a little MATLAB convenience on the one big file for a ~3× size cut.
+
+## Options considered
+
+| Format | Python cost | MATLAB cost | Notes |
+| --- | --- | --- | --- |
+| **Gzipped TSV** ✅ | none (stdlib/pandas) | none (`readtable`) | Universal, text, types re-specified on read. Chosen. |
+| Parquet | `pyarrow` or `fastparquet` (~40–60 MB wheel) as a `raven-python[kegg]` extra | needs ≥ R2019a (`parquetread`, native) | Smaller, faster, typed, columnar. Win mainly at scale / repeated random access. |
+| SQLite | none (stdlib `sqlite3`) | **needs Database Toolbox** | Rejected: the MATLAB-side toolbox requirement breaks the "same files, both languages, no extra deps" goal. |
+
+## When to revisit
+
+Reconsider Parquet (or SQLite) if any of these become true:
+
+- The `organism_gene_ko` table grows large enough that load *time* (not just
+ size — the sort+xz change above already addresses on-disk size) becomes a real
+ bottleneck. The remaining inefficiency is that building one species' model
+ still loads all ~9M rows; sorted order makes a `searchsorted`/row-group
+ by-organism read the natural next step before reaching for Parquet.
+- We start doing repeated random-access / columnar reads rather than a single
+ load-once-per-run pattern.
+- A typed, self-describing schema becomes valuable (TSV loses dtypes; they are
+ re-specified on read).
+
+If revisited, prefer **Parquet** over SQLite (no MATLAB toolbox dependency; MATLAB
+reads Parquet natively from R2019a). It could be offered as an optional
+`raven-python[kegg]` extra (pyarrow) alongside the TSV default, rather than replacing
+it — keeping the dependency-free path intact for users who don't opt in.
diff --git a/docs/kegg_hmm_cutoff_calibration.md b/docs/kegg_hmm_cutoff_calibration.md
new file mode 100644
index 0000000..43e3b3e
--- /dev/null
+++ b/docs/kegg_hmm_cutoff_calibration.md
@@ -0,0 +1,203 @@
+# KEGG HMM-query cut-off calibration
+
+This note records the measurements behind the default KO-assignment parameters in
+`reconstruction/kegg/query.py` (`assign_kos` / `get_kegg_model_from_sequences`,
+pipeline step 3b.5) and IMPROVEMENTS **K15**. It is the evidence for moving away
+from RAVEN's `1e-50` cut-off.
+
+## What the parameters do
+
+`assign_kos` turns an `hmmscan` KO×gene E-value matrix into gene→KO assignments
+in three steps:
+
+1. **`cutoff`** — keep hits with `evalue <= cutoff`.
+2. **`min_score_ratio_ko`** — within a KO, drop genes whose
+ `log(evalue)/log(best_evalue_in_KO) < min_score_ratio_ko`.
+3. **`min_score_ratio_g`** — within a gene, drop KOs whose
+ `log(evalue)/log(best_evalue_for_gene) < min_score_ratio_g`.
+
+## Method
+
+- **Data:** KEGG release 118. Libraries: the maintainer-built `prokaryotes.hmm`
+ (831 MB) and `eukaryotes.hmm` (692 MB), 90 %-clustered, FFT-NS-2/PartTree (K12).
+- **Queries:** each organism's full proteome, extracted from `genes.pep`.
+- **Ground truth:** the organism's *real* KEGG gene→KO links, from the
+ `organism_gene_ko` table (restricted, as the table is, to reaction-linked KOs).
+- **Prediction:** `assign_kos` output, with the `organism:` prefix stripped from
+ query gene IDs so they match the bare gene IDs in the ground truth.
+- **Metrics (gene→KO level):** precision = |pred ∩ truth| / |pred|,
+ recall = |pred ∩ truth| / |truth|, F1. Reaction-level: `rxn_rec` = fraction of
+ the organism's true reactions recovered (KO→reaction via `ko_reaction`);
+ `rxn_novel` = predicted reactions **not** in the annotation set.
+- Reproduce with [`scripts/analyze_hmm_cutoffs.py`](../scripts/analyze_hmm_cutoffs.py).
+
+### Important caveat
+
+All four organisms are **present in the libraries' training set**, so their own
+sequences hit their KO profiles strongly and recall is an upper bound. The
+calibration is therefore *relative* (how the parameters trade off, and where
+RAVEN's default sits relative to the signal), not an absolute accuracy estimate.
+A genome genuinely absent from KEGG would be the next validation. Also note that
+`rxn_novel` / "precision < 1" partly reflects **legitimate homology** KEGG never
+annotated for that organism (paralogs, un-curated genes), not pure error — so the
+precision figures are a lower bound on real precision.
+
+## Organisms
+
+| code | organism | library | proteome (seqs) | true gene→KO pairs | true reactions |
+|---|---|---|---|---|---|
+| `sce` | *Saccharomyces cerevisiae* (budding yeast) | euk | 6021 | 841 | 1217 |
+| `cme` | *Cyanidioschyzon merolae* (red alga) | euk | 5010 | 709 | 1157 |
+| `eco` | *Escherichia coli* K-12 MG1655 | prok | 4288 | 1071 | 1548 |
+| `mge` | *Mycoplasmoides genitalium* G37 (minimal genome) | prok | 476 | 110 | 211 |
+
+`sce`/`eco` are model organisms; `cme`/`mge` are lesser-studied, `mge`
+additionally being a small, divergent genome.
+
+## 1. E-value separation (the key result)
+
+`log10(E-value)` percentiles of the best hit per (gene, KO) pair, split by whether
+the pair is in the organism's annotation (**matched**) or not (**novel**). Smaller
+(more negative) = stronger hit.
+
+| organism | group | n | p50 | p90 | p95 | p99 |
+|---|---|---|---|---|---|---|
+| `sce` | matched | 835 | −155 | −75 | −59 | −33 |
+| `sce` | novel | 9467 | −8 | −2 | −0 | 1 |
+| `cme` | matched | 704 | −133 | −63 | −47 | −25 |
+| `cme` | novel | 10170 | −8 | −2 | −2 | 0 |
+| `eco` | matched | 1070 | −142 | −69 | −57 | −36 |
+| `eco` | novel | 27357 | −7 | −2 | −1 | −0 |
+| `mge` | matched | 110 | −100 | −42 | −35 | −15 |
+| `mge` | novel | 1904 | −4 | −2 | −1 | −0 |
+
+**Reading:** matched pairs cluster at E ≈ 1e-100…1e-155; even their weakest 1 %
+sit at 1e-15…1e-36. Novel pairs cluster at ≈1e-8. The two are separated by ~20
+orders of magnitude. RAVEN's **`1e-50` lies inside the *matched* tail** (between
+the matched p90 and p95 for most organisms; past p90 for `mge`), so it discards
+real-but-weakly-scoring annotations while gaining nothing against the (far weaker)
+noise.
+
+## 2. Cut-off sweep
+
+`min_score_ratio_ko = 0.3`, `min_score_ratio_g = 0.8` fixed; gene→KO precision /
+recall / F1 and reaction recovery vs the annotation.
+
+### `sce`
+| cutoff | gKO prec | gKO rec | gKO F1 | rxn rec | rxn novel |
+|---|---|---|---|---|---|
+| 1e-10 | 0.57 | 0.98 | 0.72 | 0.99 | 334 |
+| 1e-20 | 0.65 | 0.98 | 0.78 | 0.97 | 283 |
+| 1e-30 | 0.72 | 0.97 | 0.83 | 0.97 | 216 |
+| 1e-50 | 0.78 | 0.95 | 0.86 | 0.96 | 157 |
+| 1e-70 | 0.81 | 0.91 | 0.86 | 0.91 | 113 |
+| 1e-100 | 0.84 | 0.76 | 0.80 | 0.79 | 68 |
+
+### `cme`
+| cutoff | gKO prec | gKO rec | gKO F1 | rxn rec | rxn novel |
+|---|---|---|---|---|---|
+| 1e-10 | 0.50 | 0.98 | 0.67 | 1.00 | 541 |
+| 1e-20 | 0.57 | 0.98 | 0.72 | 1.00 | 421 |
+| 1e-30 | 0.61 | 0.97 | 0.75 | 0.97 | 367 |
+| 1e-50 | 0.70 | 0.93 | 0.80 | 0.94 | 307 |
+| 1e-70 | 0.75 | 0.85 | 0.80 | 0.87 | 223 |
+| 1e-100 | 0.80 | 0.70 | 0.75 | 0.71 | 136 |
+
+### `eco`
+| cutoff | gKO prec | gKO rec | gKO F1 | rxn rec | rxn novel |
+|---|---|---|---|---|---|
+| 1e-10 | 0.53 | 0.99 | 0.69 | 0.99 | 382 |
+| 1e-20 | 0.57 | 0.99 | 0.73 | 0.99 | 300 |
+| 1e-30 | 0.60 | 0.98 | 0.75 | 0.99 | 268 |
+| 1e-50 | 0.67 | 0.95 | 0.78 | 0.98 | 198 |
+| 1e-70 | 0.73 | 0.88 | 0.80 | 0.93 | 157 |
+| 1e-100 | 0.82 | 0.74 | 0.77 | 0.80 | 96 |
+
+### `mge`
+| cutoff | gKO prec | gKO rec | gKO F1 | rxn rec | rxn novel |
+|---|---|---|---|---|---|
+| 1e-10 | 0.52 | 0.98 | 0.68 | 0.99 | 75 |
+| 1e-20 | 0.62 | 0.96 | 0.75 | 0.98 | 51 |
+| 1e-30 | 0.65 | 0.95 | 0.77 | 0.98 | 39 |
+| 1e-50 | 0.77 | 0.84 | 0.80 | 0.87 | 29 |
+| 1e-70 | 0.85 | 0.73 | 0.78 | 0.73 | 27 |
+| 1e-100 | 0.87 | 0.50 | 0.64 | 0.47 | 21 |
+
+**Reading:** recall is flat-and-high from 1e-10 to ~1e-30, then falls as the
+cut-off eats into the matched tail — gently for model organisms, sharply for the
+divergent `mge` (rxn recall 0.98 → 0.87 from 1e-30 → 1e-50, → 0.47 at 1e-100).
+The recall lost to a stricter cut-off is *not* noise rejection (noise is at 1e-8);
+it is real annotation. `rxn_novel` shrinks with stricter cut-offs because strong
+un-annotated homologs are also removed.
+
+## 3. Score-ratio sweep (`cutoff = 1e-50`)
+
+| organism | ko ratio | g ratio | gKO prec | gKO rec | gKO F1 |
+|---|---|---|---|---|---|
+| `sce` | 0.0 | 0.50 | 0.61 | 0.96 | 0.74 |
+| `sce` | 0.0 | 0.80 | 0.77 | 0.95 | 0.85 |
+| `sce` | 0.0 | 0.95 | 0.84 | 0.93 | 0.88 |
+| `sce` | 0.3 | 0.80 | 0.78 | 0.95 | 0.86 |
+| `sce` | 0.5 | 0.80 | 0.80 | 0.95 | 0.86 |
+| `cme` | 0.0 | 0.50 | 0.53 | 0.94 | 0.68 |
+| `cme` | 0.0 | 0.80 | 0.69 | 0.93 | 0.79 |
+| `cme` | 0.0 | 0.95 | 0.78 | 0.92 | 0.84 |
+| `cme` | 0.3 | 0.80 | 0.70 | 0.93 | 0.80 |
+| `cme` | 0.5 | 0.80 | 0.70 | 0.93 | 0.80 |
+| `eco` | 0.0 | 0.50 | 0.39 | 0.96 | 0.56 |
+| `eco` | 0.0 | 0.80 | 0.66 | 0.95 | 0.78 |
+| `eco` | 0.0 | 0.95 | 0.76 | 0.94 | 0.84 |
+| `eco` | 0.3 | 0.80 | 0.67 | 0.95 | 0.78 |
+| `eco` | 0.5 | 0.80 | 0.69 | 0.95 | 0.80 |
+| `mge` | 0.0 | 0.50 | 0.62 | 0.85 | 0.72 |
+| `mge` | 0.0 | 0.80 | 0.77 | 0.84 | 0.80 |
+| `mge` | 0.0 | 0.95 | 0.82 | 0.81 | 0.81 |
+| `mge` | 0.3 | 0.80 | 0.77 | 0.84 | 0.80 |
+| `mge` | 0.5 | 0.80 | 0.78 | 0.84 | 0.81 |
+
+**Reading:**
+- **`min_score_ratio_ko` is inert** — across all four organisms, varying it
+ 0.0 → 0.3 → 0.5 changes precision/recall by ≤0.02 (mostly 0.00). It is a
+ magic-number knob that does effectively nothing here. (Full 0.0/0.3/0.5 × g-grid
+ in the script output; representative rows shown.)
+- **`min_score_ratio_g` is the real precision lever** — 0.80 → 0.95 lifts
+ precision ~0.07–0.10 for ~0.02 recall loss. 0.50 is clearly too loose.
+
+## 4. Chosen defaults and effect
+
+| parameter | RAVEN / old | new default | rationale |
+|---|---|---|---|
+| `cutoff` | 1e-50 | **1e-30** | recovers the matched tail (esp. divergent genomes); still ~22 orders above the 1e-8 noise floor |
+| `min_score_ratio_g` | 0.8 | **0.9** | the effective precision lever; offsets the looser cut-off |
+| `min_score_ratio_ko` | 0.3 | 0.3 (kept) | empirically inert; retained for RAVEN parity |
+
+Old default `(1e-50, 0.3, 0.8)` vs new default `(1e-30, 0.3, 0.9)`
+(`min_score_ratio_ko` 0.3 ≡ 0.0 here):
+
+| organism | gKO prec | gKO rec | rxn rec | rxn novel |
+|---|---|---|---|---|
+| `sce` | 0.78 → 0.76 | 0.95 → 0.96 | 0.96 → 0.96 | 157 → 137 |
+| `cme` | 0.70 → 0.67 | 0.93 → 0.96 | 0.94 → 0.97 | 307 → 305 |
+| `eco` | 0.67 → 0.67 | 0.95 → 0.97 | 0.98 → 0.99 | 198 → 173 |
+| `mge` | 0.77 → 0.69 | **0.84 → 0.94** | **0.87 → 0.97** | 29 → 35 |
+
+The divergent minimal genome gains ~10 points of recall (the case the sequence
+path exists for); model organisms improve slightly and `eco` emits *fewer*
+unannotated reactions (the tighter gene-ratio prunes spurious multi-KO genes). The
+small precision dip vs annotation is dominated by extra strong homologs, not
+weak-hit noise.
+
+## 5. Whole-model cross-validation (sanity check)
+
+Full reconstruction of *S. cerevisiae* two ways, at the old defaults:
+
+| | annotation path (3b.4) | HMM path (3b.5) |
+|---|---|---|
+| reactions | 1355 | 1461 |
+| metabolites | 1501 | 1567 |
+| genes | 835 | 896 |
+
+Reaction recall 96.3 % (1305/1355 shared, Jaccard 0.86); gene recall 96.6 %
+(807/835 shared, Jaccard 0.87). The annotation path also exercises the new
+`organism_gene_ko.tsv.xz` artefact (K14). `hmmscan` throughput ≈ 0.1 s/query
+against either library on 12 threads (yeast: 6021 queries in 633 s).
diff --git a/docs/maintaining_binaries.md b/docs/maintaining_binaries.md
new file mode 100644
index 0000000..df5b315
--- /dev/null
+++ b/docs/maintaining_binaries.md
@@ -0,0 +1,236 @@
+# Maintaining bundled binaries (BLAST+, DIAMOND, …)
+
+Audience: **raven-python maintainers / the GitHub repo owner.** This explains how
+raven-python ships external command-line tools, how to update their versions, and how
+to build **minimal-footprint** ZIPs to attach to a GitHub release.
+
+> End users never read this. They get a binary automatically via `ensure_binary`,
+> or use their own (system/conda) install. This doc is only for whoever publishes
+> the release assets.
+
+---
+
+## 1. How binary provisioning works
+
+raven-python does **not** vendor binaries in the git repo or on PyPI. Instead:
+
+1. For each tool we publish **version-pinned ZIPs as GitHub release assets**.
+2. A **registry** (`src/raven_python/binaries_registry.json`) maps each *bundle* to its
+ version, the executables it provides, and per-platform `{asset, sha256}`.
+3. At run time `raven_python.binaries.ensure_binary("blastp")` resolves a tool in this
+ order — and only reaches the download as a last resort:
+
+ ```
+ explicit binary= arg → env var (RAVEN_PYTHON_BLASTP / RAVEN_PYTHON_DIAMOND / …)
+ → shutil.which on PATH (system / conda / apt / brew)
+ → ensure_binary: download the pinned ZIP → verify SHA256 → cache → return path
+ → actionable error (with conda / manual instructions)
+ ```
+
+So a pre-installed binary always wins; the bundle is the zero-setup fallback.
+Pinning the version makes reconstruction **reproducible**.
+
+A *bundle* can provide several executables from one download (e.g. the `blast`
+bundle provides both `blastp` and `makeblastdb`), so they are fetched once.
+
+---
+
+## 2. What raven-python actually needs — ship only these
+
+Distribute the **minimum** set of executables. Everything else (other suite
+tools, docs, examples, changelogs) must be excluded.
+
+| Bundle | Executables to include | Everything else |
+|---|---|---|
+| `diamond` | `diamond` | — (it is a single static binary) |
+| `blast` | `blastp`, `makeblastdb` | **drop** `blastn`, `tblastn`, `psiblast`, `rpsblast`, `blast_formatter`, `*_vdb`, the `doc/`, `ChangeLog`, `README`, ~30 other tools |
+
+(Confirmed against RAVEN `getBlast`/`getDiamond`: only `makeblastdb`+`blastp`, and
+`diamond` for its `makedb`/`blastp` subcommands, are ever invoked.)
+
+For BLAST+ this is the big win: the full NCBI suite is ~hundreds of MB; two
+binaries (stripped) are a small fraction.
+
+---
+
+## 3. Asset & ZIP conventions
+
+**Asset filename:** `---.zip`
+
+- `` ∈ `linux`, `macos`, `windows`
+- `` ∈ `x86_64`, `arm64`
+- examples: `diamond-2.1.11-linux-x86_64.zip`, `blast-2.16.0-macos-arm64.zip`
+
+**ZIP layout — flat, executables at the root, plus the upstream licence:**
+
+```
+diamond-2.1.11-linux-x86_64.zip
+├── diamond
+└── LICENSE
+
+blast-2.16.0-linux-x86_64.zip
+├── blastp
+├── makeblastdb
+└── LICENSE
+```
+
+No nested `bin/`, no extra files. `ensure_binary` extracts the ZIP into the cache
+and expects the executable at the top level.
+
+---
+
+## 4. Step-by-step: add or update a version
+
+Example: bump DIAMOND to a new version for Linux x86-64. Repeat per `(os, arch)`.
+
+1. **Download the official upstream build** (never rebuild from source unless you
+ must):
+ - DIAMOND →
+ (`diamond-linux64.tar.gz`, `diamond-macos.tar.gz`)
+ - BLAST+ → or a
+ pinned version dir (`ncbi-blast-+-x64-linux.tar.gz`,
+ `-x64-macosx.tar.gz`, `-aarch64-linux.tar.gz`, `-x64-win64.tar.gz`).
+ - Record the upstream URL **and** its published checksum for provenance.
+2. **Extract only the needed executables** (see §2) to a clean staging dir.
+3. **Strip debug symbols** to shrink (skip on Windows / signed macOS builds):
+ ```bash
+ strip diamond # or: strip blastp makeblastdb
+ ```
+4. **Smoke-test the stripped binaries in a clean shell** (no other tools on PATH):
+ ```bash
+ ./diamond --version
+ ./blastp -version && ./makeblastdb -version
+ ```
+ If they fail for a missing shared library, add that `.so`/`.dylib` to the ZIP
+ (rare — NCBI/DIAMOND release builds are largely self-contained).
+5. **Add the upstream licence file** as `LICENSE` (see §6).
+6. **Zip with max compression, flat layout:**
+ ```bash
+ zip -9 -j diamond-2.1.11-linux-x86_64.zip diamond LICENSE
+ # -j junks paths so entries sit at the ZIP root
+ ```
+7. **Compute the SHA256:**
+ ```bash
+ sha256sum diamond-2.1.11-linux-x86_64.zip # shasum -a 256 on macOS
+ ```
+8. **Attach the ZIP to a raven-python GitHub release** (a release tagged for the binary
+ set, e.g. `binaries-2024.06`, keeps them independent of code releases).
+9. **Update the registry** `src/raven_python/binaries_registry.json` — bump `version`
+ and set the per-platform `asset` + `sha256`:
+ ```json
+ {
+ "diamond": {
+ "version": "2.1.11",
+ "provides": ["diamond"],
+ "platforms": {
+ "linux-x86_64": {
+ "asset": "diamond-2.1.11-linux-x86_64.zip",
+ "url": "https://github.com/SysBioChalmers/raven-python/releases/download/binaries-2024.06/diamond-2.1.11-linux-x86_64.zip",
+ "sha256": ""
+ }
+ }
+ },
+ "blast": {
+ "version": "2.16.0",
+ "provides": ["blastp", "makeblastdb"],
+ "platforms": { "linux-x86_64": { "asset": "...", "url": "...", "sha256": "..." } }
+ }
+ }
+ ```
+10. **Commit the registry change**, run the homology tests, and (if you have the
+ binary) confirm `ensure_binary("diamond", version="2.1.11")` downloads,
+ verifies, and runs.
+
+---
+
+## 5. Keeping the footprint minimal — checklist
+
+- ✅ Only the executables in §2 (for BLAST+, exactly `blastp` + `makeblastdb`).
+- ✅ `strip` the binaries (often halves their size).
+- ✅ `zip -9 -j` (max compression, flat — no `bin/`, no folders).
+- ✅ Exactly one extra file: `LICENSE`.
+- ❌ No docs, examples, `ChangeLog`, `README`, man pages, test data, or sibling tools.
+- ❌ No `.dSYM`/debug bundles; no duplicate static `.a` libraries.
+- ➕ Only add a shared library if step-4 testing proves it is required.
+
+---
+
+## 6. Platform / architecture matrix & licensing
+
+**Coverage = what you build.** Start with `linux-x86_64` (CI default), then add
+`macos-arm64`, `macos-x86_64`, `linux-arm64`, `windows-x86_64` as capacity allows.
+For any `(os, arch)` **not** in the registry, `ensure_binary` raises an actionable
+error pointing to conda (`conda install -c bioconda diamond blast`) or a manual
+install — that is the documented fallback, not a failure to fix urgently.
+
+**Licensing (must comply when redistributing):**
+
+- **BLAST+** — produced by NCBI (US Government); **public domain**, free to
+ redistribute. Include NCBI's `LICENSE` for courtesy/provenance.
+- **DIAMOND** — **GPLv3**. Redistribution is allowed; you **must** include the
+ GPLv3 licence text in the ZIP and keep the binary unmodified (or offer source).
+- **HMMER** (future) — BSD-3-Clause; include its `LICENSE`.
+
+Always ship the upstream licence in the ZIP, and keep a `BINARIES_PROVENANCE.md`
+(or a note in the release body) recording, per asset: upstream URL, upstream
+version, upstream checksum, and the SHA256 you published.
+
+### Native OS support per tool
+
+raven-python invokes each tool through `subprocess.run([resolved_path, …])` — that
+call is itself cross-platform, so the real constraint is whether a given tool has
+a binary that runs natively on each OS. It varies:
+
+| Tool | Linux | macOS (incl. arm64) | Windows (native) |
+|---|---|---|---|
+| BLAST+ (`blastp`, `makeblastdb`) | ✅ | ✅ | ✅ (NCBI ships Windows builds) |
+| DIAMOND | ✅ | ✅ | ⚠️ native build exists but Linux-first |
+| HMMER (`hmmbuild`/`hmmpress`/`hmmsearch`/`hmmscan`) | ✅ | ✅ | ❌ no official native build |
+| MAFFT | ✅ | ✅ | ⚠️ Windows package is a wrapper |
+| CD-HIT | ✅ | ✅ | ❌ no Windows build exists |
+
+Implications:
+
+- **Linux / macOS** — everything works. `conda install -c bioconda hmmer mafft
+ cd-hit blast diamond`, or point the `RAVEN_PYTHON_*` env vars at your installs.
+- **Native Windows** — the homology track (BLAST+/DIAMOND) works, but the **KEGG
+ HMM build (3b.3) and HMM query (3b.5) do not**: HMMER and CD-HIT have no Windows
+ binaries, and bioconda has no Windows packages for any of them. Bundling can't
+ fix this — there is no binary to bundle.
+- **Windows users should run raven-python inside WSL2** (or a Linux container), where
+ every tool is native Linux. raven-python does **not** replicate RAVEN's
+ `getWSLpath`/`wsl …` path translation: it calls the resolved binary directly, so
+ mixing native-Windows Python with WSL binaries is unsupported — keep the whole
+ stack inside WSL2.
+- The common end-user paths — homology reconstruction and the KEGG *species* model
+ (3b.4) — need no HMMER/MAFFT/CD-HIT, so they are fully cross-platform.
+
+---
+
+## 7. Emitting the registry entry
+
+After building the per-platform ZIPs (named `---.zip`)
+and uploading them to the release, generate the `_REGISTRY` entry — checksums and
+URLs — with [`scripts/make_registry_snippet.py`](../scripts/README.md):
+
+```bash
+python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \
+ --provides blastp makeblastdb --dir zips \
+ --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0
+```
+
+It prints the ready-to-paste `_REGISTRY["blast"]` block; its SHA256 helper is the
+same one `ensure_binary` verifies with, so the checksums always match. (Producing
+the minimal ZIPs themselves — download upstream, `strip`, `zip -9 -j`, add
+`LICENSE` per §3–§6 — is still a manual/per-tool step.)
+
+---
+
+## 8. Adding a new tool later (e.g. HMMER for KEGG reconstruction)
+
+1. Decide the **minimal executable set** (e.g. HMMER → `hmmsearch`, `hmmscan`,
+ maybe `hmmbuild`/`hmmpress`).
+2. Add a bundle entry to the registry with `provides` listing those executables.
+3. Build/attach ZIPs per §3–§4; include the tool's licence (§6).
+4. The wrappers call `ensure_binary("hmmsearch", …)` with the same resolution
+ order — no new provisioning code needed.
diff --git a/docs/maintaining_kegg_data.md b/docs/maintaining_kegg_data.md
new file mode 100644
index 0000000..f53d0da
--- /dev/null
+++ b/docs/maintaining_kegg_data.md
@@ -0,0 +1,157 @@
+# Maintaining the KEGG data artefacts
+
+This guide is for the **package maintainer** who rebuilds raven-python's KEGG
+artefacts once per KEGG release. End users never do this — they download the
+published, version-pinned artefacts. The build has two implemented steps so far:
+**3b.1 download** (`reconstruction/kegg/download.py`) and **3b.2 parse**
+(`reconstruction/kegg/parse.py`); see PLAN.md §2.3b for the full pipeline.
+
+## Prerequisites
+
+### A paid KEGG FTP subscription
+The bulk KEGG dump is licensed. You need an active subscription to
+`ftp.kegg.net`, which gives you a **username and password**.
+
+### Credentials in `~/.netrc`
+The download reads your KEGG username and password from a `~/.netrc` file — it
+never takes them on the command line, so they stay out of your shell history and
+out of `ps` output. Create the file (readable only by you) and add a `machine`
+line for the KEGG host:
+
+```bash
+touch ~/.netrc && chmod 600 ~/.netrc
+```
+
+Then add this single line to `~/.netrc`, substituting your subscription
+credentials:
+
+```
+machine ftp.kegg.net login YOUR_KEGG_USER password YOUR_KEGG_PASSWORD
+```
+
+Notes:
+- The host **must be `ftp.kegg.net`** — that is the machine name the downloader
+ looks up. A `machine` line for any other host is ignored.
+- The file **must be mode `600`** (owner read/write only). Python's `netrc`
+ parser refuses a `.netrc` that other users can read.
+- `~/.netrc` is the same convention `curl`, `wget` and `git` use, so if you
+ already have one, just add the `ftp.kegg.net` line to it.
+
+If you keep secrets somewhere other than `$HOME`, point the downloader at a
+different file with `netrc_path=...` (see below); the format is identical.
+
+## Step 3b.1 — download and arrange the dump
+
+With `~/.netrc` in place, no credentials need to be passed in code:
+
+```python
+from raven_python.reconstruction.kegg import download_kegg_dump
+
+# Reads ~/.netrc, fetches the KEGG archives, extracts and arranges them.
+download_kegg_dump("keggdb")
+```
+
+This fetches the reaction / compound / glycan / ko archives, the eukaryote and
+prokaryote proteomes, and the taxonomy file; extracts them; and arranges the
+flat layout the parser expects (`reaction`, `reaction.lst`,
+`reaction_mapformula.lst`, `compound` = compound + glycan, `compound.inchi`,
+`ko`, `genes.pep` = both proteomes, `taxonomy`).
+
+Credential alternatives:
+
+```python
+# A .netrc in a non-default location:
+download_kegg_dump("keggdb", netrc_path="/run/secrets/kegg_netrc")
+
+# Pass credentials explicitly (only when they come from a secret manager at
+# runtime — never hardcode literals in committed code):
+download_kegg_dump("keggdb", auth=("YOUR_KEGG_USER", "YOUR_KEGG_PASSWORD"))
+```
+
+Already-downloaded files are skipped; pass `force=True` to re-fetch (for a new
+KEGG release).
+
+## Step 3b.2 — parse into the published artefacts
+
+```python
+from raven_python.reconstruction.kegg import parse_kegg_dump
+
+parse_kegg_dump("keggdb", "artefacts")
+```
+
+This writes the gene-free reference model (`reference_model.yml.gz`, gzipped
+RAVEN/cobra YAML) and the relational tables as gzipped TSV. See
+[kegg_data_format.md](kegg_data_format.md) for what those tables contain and the
+format rationale.
+
+## Step 3b.3 — build the HMM libraries
+
+Build the per-domain profile-HMM libraries that the de-novo query path (3b.5)
+searches. This needs **HMMER** (`hmmbuild`, `hmmpress`), **MAFFT**, and
+**CD-HIT** on `PATH` (or set `RAVEN_PYTHON_HMMBUILD` / `RAVEN_PYTHON_MAFFT` /
+`RAVEN_PYTHON_CDHIT`, etc.); install e.g. `conda install -c bioconda hmmer mafft cd-hit`.
+
+> **OS note:** these three tools run on Linux and macOS but **not native
+> Windows** — on Windows, run this step inside WSL2. See the native-OS-support
+> matrix in [maintaining_binaries.md](maintaining_binaries.md#native-os-support-per-tool).
+
+```python
+from raven_python.reconstruction.kegg import build_hmm_library, read_kegg_table
+
+organism_gene_ko = read_kegg_table("artefacts/organism_gene_ko.tsv.xz")
+for domain in ("prokaryotes", "eukaryotes"):
+ build_hmm_library(
+ organism_gene_ko,
+ "keggdb/genes.pep", # proteomes from 3b.1
+ "keggdb/taxonomy", # domain split, from 3b.1
+ f"hmms/{domain}",
+ domain=domain,
+ )
+```
+
+For each KO in the domain it gathers the member sequences, dereplicates with
+CD-HIT (~90 % identity), aligns with MAFFT, trains a profile with `hmmbuild`, and
+finally concatenates and `hmmpress`-es them into a single `library.hmm` for fast
+`hmmscan` querying. This is the slowest step (hours, once per KEGG release); it
+skips KOs whose `.hmm` already exists, so it is resumable. The resulting
+libraries are published as version-pinned artefacts alongside the reference model
+and tables.
+
+## Building and publishing in one go
+
+[`scripts/build_kegg_artefacts.py`](../scripts/README.md) runs 3b.2 (+ 3b.3 with
+`--hmms`) and lays the output out as publishable assets (`.hmm` named for
+`ensure_kegg_hmm_library`):
+
+```bash
+python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8
+```
+
+Upload the contents of `artefacts/` to a release, then emit the registry entry for
+`raven_python.data._DATA_REGISTRY` with [`scripts/make_registry_snippet.py`](../scripts/README.md):
+
+```bash
+python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \
+ --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116
+```
+
+Paste the printed block into `_DATA_REGISTRY`; from then on `ensure_data` fetches
+and verifies the artefacts for end users automatically.
+
+## End-user paths (3b.4 / 3b.5)
+
+End users do **not** run the steps above — the published artefacts are fetched and
+cached automatically by `ensure_data` (`raven_python.data`) under
+`~/.cache/raven-python/data/kegg-/` on first use, so the entry points below
+can be called with no local paths at all (pass an explicit `artefact_dir=`/
+`library=` to use your own build instead). Two runtime entry points build a draft
+model from the artefacts:
+
+- **3b.4 — species in KEGG** (`get_kegg_model_for_organism_from_artefacts`): no
+ binaries needed; uses the organism's KEGG gene↔KO annotations. Fully
+ cross-platform. `organism_id="prokaryotes"`/`"eukaryotes"` builds a whole-domain
+ model (pass `taxonomy=`).
+- **3b.5 — organism not in KEGG** (`get_kegg_model_from_sequences`): `hmmscan`-es a
+ proteome FASTA against the pressed `library.hmm`, so it needs **HMMER**
+ (`hmmscan`) — Linux/macOS or WSL2 (see the OS matrix). Tune assignment with
+ `cutoff`, `min_score_ratio_ko`, `min_score_ratio_g`.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..faeeb1f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,84 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "raven-python"
+version = "0.0.1"
+description = "Reconstruction, Analysis and Visualization of Metabolic Networks in Python, a port of the RAVEN Toolbox built on cobrapy"
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "GPL-3.0-or-later" }
+authors = [
+ { name = "Eduard Kerkhoven", email = "eduardk@chalmers.se" },
+]
+keywords = [
+ "genome-scale-model",
+ "metabolic-model",
+ "reconstruction",
+ "raven",
+ "cobra",
+ "systems-biology",
+ "constraint-based-modeling",
+ "kegg",
+ "metacyc",
+ "tinit",
+]
+classifiers = [
+ "Development Status :: 2 - Pre-Alpha",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dependencies = [
+ "cobra>=0.29",
+ "numpy>=1.21",
+ "pandas>=2",
+ "scipy>=1.10",
+ "ruamel.yaml>=0.17",
+ "requests>=2.28",
+ "tqdm>=4.65",
+]
+
+[project.optional-dependencies]
+dev = [
+ "pytest>=7",
+ "pytest-cov",
+ "ruff>=0.4",
+]
+excel = [
+ "openpyxl>=3.1",
+]
+plotting = [
+ "matplotlib>=3.5",
+]
+
+[project.urls]
+Homepage = "https://github.com/SysBioChalmers/raven-python"
+Source = "https://github.com/SysBioChalmers/raven-python"
+Issues = "https://github.com/SysBioChalmers/raven-python/issues"
+"RAVEN MATLAB" = "https://github.com/SysBioChalmers/RAVEN"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/raven_python"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py311"
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I", "UP", "B"]
+ignore = [
+ "E501", # line length handled by the formatter
+]
+
+[tool.ruff.lint.per-file-ignores]
+"tests/*" = ["E402"]
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..67a3b36
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,35 @@
+# Maintainer scripts
+
+Release-time tooling. Not part of the installed package — run them from a checkout
+with raven-python installed (`pip install -e .`). End users never need these.
+
+## `build_kegg_artefacts.py`
+
+Build the publishable KEGG artefact set from an arranged KEGG dump (see
+`download_kegg_dump`): the gzipped-YAML reference model, the gzipped-TSV tables,
+and (with `--hmms`) the per-domain pressed HMM libraries. Output is laid out ready
+to upload as release assets. See [docs/maintaining_kegg_data.md](../docs/maintaining_kegg_data.md).
+
+```bash
+python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts # tables + model
+python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8
+```
+
+## `make_registry_snippet.py`
+
+After uploading the files to a release, compute their SHA256 and print the entry
+to merge into the runtime registry — `raven_python.data._DATA_REGISTRY` (data) or
+`raven_python.binaries._REGISTRY` (binary ZIP bundles). The checksum helper is shared
+with the resolvers, so published checksums always match what `ensure_data` /
+`ensure_binary` verify.
+
+```bash
+# Data artefacts:
+python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \
+ --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116
+
+# Binary bundle (ZIPs named ---.zip):
+python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \
+ --provides blastp makeblastdb --dir zips \
+ --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0
+```
diff --git a/scripts/analyze_hmm_cutoffs.py b/scripts/analyze_hmm_cutoffs.py
new file mode 100644
index 0000000..654fc02
--- /dev/null
+++ b/scripts/analyze_hmm_cutoffs.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""Cut-off sensitivity for the KEGG HMM query path (step 3b.5).
+
+Cross-validates ``assign_kos`` against an organism's *real* KEGG gene→KO
+annotation (from the ``organism_gene_ko`` table) and sweeps the E-value cut-off
+and the two score-ratio filters. Produces the tables in
+``docs/kegg_hmm_cutoff_calibration.md``.
+
+Usage
+-----
+ python scripts/analyze_hmm_cutoffs.py \
+ --artefacts ~/keggdb_artefacts \
+ --proteome /path/to/org.pep \
+ --org sce --library ~/keggdb_artefacts/eukaryotes.hmm
+
+``--proteome`` is the organism's protein FASTA (headers ``>org:gene ...``, e.g.
+extracted from KEGG ``genes.pep``). ``--tblout`` may be given instead of
+``--library`` to reuse a cached ``hmmscan --tblout`` file. Requires ``hmmscan``
+on PATH or via ``RAVEN_PYTHON_HMMER`` when ``--library`` is used.
+
+Caveat: organisms present in the library's training set give an upper bound on
+recall; the comparison is relative (see the doc).
+"""
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from raven_python.reconstruction.kegg.parse import read_kegg_table
+from raven_python.reconstruction.kegg.query import (
+ assign_kos,
+ parse_hmmscan_tblout,
+ run_hmmscan,
+)
+
+CUTOFFS = (1e-10, 1e-20, 1e-30, 1e-50, 1e-70, 1e-100)
+KO_RATIOS = (0.0, 0.3, 0.5)
+G_RATIOS = (0.5, 0.8, 0.95)
+
+
+def load_ko2rxn(artefacts: Path) -> dict[str, set[str]]:
+ tbl = read_kegg_table(artefacts / "ko_reaction.tsv.gz")
+ ko2rxn: dict[str, set[str]] = {}
+ for ko, rxn in zip(tbl["ko"], tbl["reaction"], strict=True):
+ ko2rxn.setdefault(ko, set()).add(rxn)
+ return ko2rxn
+
+
+def ground_truth(artefacts: Path, org: str, ko2rxn) -> tuple[set, set]:
+ ogk = read_kegg_table(artefacts / "organism_gene_ko.tsv.xz")
+ rows = ogk[ogk["organism"].str.lower() == org]
+ pairs = set(zip(rows["gene"], rows["ko"], strict=True))
+ rxns = {r for _, ko in pairs for r in ko2rxn.get(ko, ())}
+ return pairs, rxns
+
+
+def predicted_pairs(hits: pd.DataFrame, **kw) -> set:
+ out = set()
+ for ko, genes in assign_kos(hits, **kw).items():
+ for g in genes:
+ out.add((g.split(":", 1)[1] if ":" in g else g, ko))
+ return out
+
+
+def prf(pred: set, truth: set) -> tuple[float, float, float]:
+ tp = len(pred & truth)
+ rec = tp / len(truth) if truth else 0.0
+ prec = tp / len(pred) if pred else 0.0
+ f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
+ return prec, rec, f1
+
+
+def main(argv=None) -> None:
+ ap = argparse.ArgumentParser(description=__doc__)
+ ap.add_argument("--artefacts", type=Path, required=True)
+ ap.add_argument("--org", required=True, help="KEGG organism code, e.g. sce")
+ ap.add_argument("--proteome", type=Path, help="protein FASTA (headers >org:gene)")
+ ap.add_argument("--library", type=Path, help="pressed HMM library for hmmscan")
+ ap.add_argument("--tblout", type=Path, help="cached hmmscan --tblout (skip hmmscan)")
+ ap.add_argument("--threads", type=int, default=4)
+ args = ap.parse_args(argv)
+
+ if args.tblout:
+ text = args.tblout.read_text()
+ elif args.library and args.proteome:
+ text = run_hmmscan(args.proteome, args.library, threads=args.threads)
+ else:
+ ap.error("give --tblout, or --library and --proteome")
+
+ org = args.org.lower()
+ hits = parse_hmmscan_tblout(text)
+ hits = hits[hits["gene"].str.startswith(f"{org}:")].reset_index(drop=True)
+ ko2rxn = load_ko2rxn(args.artefacts)
+ gt_pairs, gt_rxns = ground_truth(args.artefacts, org, ko2rxn)
+
+ print(f"\n{'='*70}\n{org}: {hits['gene'].nunique()} query genes with hits, "
+ f"{len(gt_pairs)} true gene->KO pairs, {len(gt_rxns)} true reactions\n{'='*70}")
+
+ best: dict[tuple, float] = {}
+ for ko, gene, e in zip(hits["ko"], hits["gene"], hits["evalue"], strict=True):
+ key = (gene.split(":", 1)[1], ko)
+ if key not in best or e < best[key]:
+ best[key] = e
+ matched = np.array([e for k, e in best.items() if k in gt_pairs])
+ novel = np.array([e for k, e in best.items() if k not in gt_pairs])
+
+ def logq(arr, q):
+ if not len(arr):
+ return float("nan")
+ v = np.quantile(arr, q)
+ return np.log10(v) if v > 0 else -300.0
+
+ print("\nlog10(E-value) percentiles [matched=in annotation, novel=not]:")
+ print(f" {'group':<8}{'n':>7}{'p50':>8}{'p90':>8}{'p95':>8}{'p99':>8}")
+ for name, arr in (("matched", matched), ("novel", novel)):
+ print(f" {name:<8}{len(arr):>7}{logq(arr,.5):>8.0f}{logq(arr,.9):>8.0f}"
+ f"{logq(arr,.95):>8.0f}{logq(arr,.99):>8.0f}")
+
+ print("\ncutoff sweep (min_score_ratio_ko=0.3, min_score_ratio_g=0.8):")
+ print(f" {'cutoff':>8}{'gKO_prec':>9}{'gKO_rec':>8}{'gKO_F1':>8}{'rxn_rec':>9}{'rxn_novel':>10}")
+ for cutoff in CUTOFFS:
+ pred = predicted_pairs(hits, cutoff=cutoff)
+ prec, rec, f1 = prf(pred, gt_pairs)
+ pred_rxns = {r for _, ko in pred for r in ko2rxn.get(ko, ())}
+ rrec = len(pred_rxns & gt_rxns) / len(gt_rxns) if gt_rxns else 0.0
+ print(f" {cutoff:>8.0e}{prec:>9.2f}{rec:>8.2f}{f1:>8.2f}{rrec:>9.2f}"
+ f"{len(pred_rxns - gt_rxns):>10}")
+
+ print("\nratio sweep (cutoff=1e-50):")
+ print(f" {'ko_ratio':>9}{'g_ratio':>8}{'gKO_prec':>9}{'gKO_rec':>8}{'gKO_F1':>8}")
+ for rko in KO_RATIOS:
+ for rg in G_RATIOS:
+ pred = predicted_pairs(hits, cutoff=1e-50,
+ min_score_ratio_ko=rko, min_score_ratio_g=rg)
+ prec, rec, f1 = prf(pred, gt_pairs)
+ print(f" {rko:>9.1f}{rg:>8.2f}{prec:>9.2f}{rec:>8.2f}{f1:>8.2f}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/analyze_init_params.py b/scripts/analyze_init_params.py
new file mode 100644
index 0000000..1059006
--- /dev/null
+++ b/scripts/analyze_init_params.py
@@ -0,0 +1,367 @@
+#!/usr/bin/env python3
+"""Parameter calibration for (f)tINIT — intrinsic speed/quality sweeps (Phase 4d.7).
+
+Genome-scale benchmark that sweeps the MILP/conditioning parameters of raven_python's
+:func:`raven_python.init.run_ftinit`, :func:`raven_python.init.ftinit`, :func:`run_init`, and
+:func:`prep_init_model` and records, for each value, the *intrinsic* trade-off: wall-clock
+solve time, the MILP objective, and how far the result drifts from the tightest-setting
+("reference") run — both in objective (relative gap) and in the **kept-reaction set**
+(Jaccard). No external (RAVEN) reference is used: the question answered here is "what is
+the loosest / cheapest setting that still reproduces the tight-setting solution?".
+
+Why reaction-set drift matters: a MIP gap g only guarantees the *objective* is within g of
+optimal; the *which-reactions* answer can jump between alternate optima well before the
+objective moves. For a model-extraction tool the reaction set is the product, so we track
+its stability explicitly.
+
+Sweeps (select with ``--sweeps``; each is resumable — results are pickled per config and a
+re-run skips finished ones):
+
+* ``ftinit_milp`` — single staged-MILP step (step 0 of series '1+1') on the merged model.
+ Cheap (~30-200 s each); the core sweep for ``mip_gap``/``big_m``/``force_on``.
+* ``prep_scale`` — rescaleModelForINIT on/off and its ``max_stoich_diff``, fed into the
+ same step-0 MILP. Shows why scaling is needed for a fixed big-M.
+* ``tinit`` — full ``get_init_model`` (classic INIT). Sweeps ``mip_gap``/``eps``/
+ ``prod_weight``/``big_m``. Expensive — uses a tight ``time_limit``.
+* ``ftinit_full`` — the whole ``ftinit`` pipeline (both steps + gap-fill). Sweeps
+ ``mip_gap``/``big_m``. Expensive (~200 s+/config).
+
+Usage
+-----
+ python scripts/analyze_init_params.py \
+ --work ~/hgem_compare --cell HCT116 --sweeps ftinit_milp,prep_scale
+
+``--work`` holds ``raven_refModel.xml`` and the Human-GEM-derived spont/custom inputs
+(see the Human-GEM validation run). Requires a MILP solver (Gurobi/HiGHS) on the cobra
+config. Produces a results pickle and prints a table per sweep; feed the tables into
+docs/init_param_calibration.md.
+"""
+from __future__ import annotations
+
+import argparse
+import pickle
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import cobra
+
+from raven_python.init import (
+ ftinit,
+ gene_scores_from_expression,
+ get_init_model,
+ prep_init_model,
+ score_reactions_from_genes,
+)
+from raven_python.init.ftinit import run_ftinit
+from raven_python.init.merge import group_rxn_scores
+from raven_python.init.prep import rescale_for_init
+from raven_python.init.steps import get_init_steps
+
+# Sweep grids (first value of each tolerance sweep is the tight "reference").
+MIP_GAPS = (0.0002, 0.001, 0.003, 0.01, 0.03, 0.1)
+BIG_MS = (100.0, 50.0, 25.0, 250.0, 1000.0)
+FORCE_ONS = (0.1, 0.02, 0.05, 0.2, 0.5)
+MAX_STOICH = (25.0, 10.0, 50.0, 100.0)
+EPS_VALS = (1.0, 0.1, 0.5, 2.0)
+PROD_WEIGHTS = (0.5, 0.0, 0.25, 1.0)
+
+# "Recommended = cheapest config within these of the reference" thresholds.
+TOL_OBJ = 0.005 # relative objective gap
+TOL_JAC = 0.99 # kept-reaction-set Jaccard
+
+
+@dataclass
+class Result:
+ """One config's outcome (reaction set stored sorted for pickling/Jaccard)."""
+
+ label: str
+ seconds: float
+ status: str
+ objective: float
+ n_kept: int
+ reactions: list[str] = field(default_factory=list)
+ rel_obj_gap: float | None = None # vs the sweep reference
+ jaccard: float | None = None # vs the sweep reference
+
+
+def _jaccard(a: set[str], b: set[str]) -> float:
+ return len(a & b) / len(a | b) if (a or b) else 1.0
+
+
+def _load_inputs(work: Path, human_gem: Path, cell: str):
+ ref = cobra.io.read_sbml_model(str(work / "raven_refModel.xml"))
+ ref.solver = cobra.Configuration().solver
+ spont = []
+ with open(human_gem / "model" / "reactions.tsv") as f:
+ hdr = f.readline().rstrip("\n").split("\t")
+ ci = hdr.index("spontaneous")
+ for line in f:
+ p = line.rstrip("\n").split("\t")
+ if p[ci] == "1":
+ spont.append(p[0])
+ protein = [f"MAR0{n}" for n in (5155, 5156, 5161, 5167, 5168, 5169, 5170, 5171, 5172,
+ 5174, 5260, 5262, 5264, 5266, 5267, 5268, 5269, 5270, 5271, 5273, 5275, 5277,
+ 5279, 5281, 5283, 5291)] + ["MAR09817", "MAR09818"]
+ pool = ["MAR00011", "MAR00012", "MAR00477", "MAR05233", "MAR05234", "MAR05238",
+ "MAR05239", "MAR05243", "MAR05244", "MAR05247", "MAR09022", "MAR00015",
+ "MAR00016", "MAR00017", "MAR10033", "MAR10035", "MAR10036", "MAR10037",
+ "MAR10038", "MAR10062", "MAR10063", "MAR10064", "MAR10065", "MAR13082"]
+ custom = sorted(set(protein) | set(pool))
+ expr: dict[str, float] = {}
+ with open(human_gem / "data" / "datasets" / "Hart2015_RNAseq.txt") as f:
+ h = f.readline().rstrip("\n").split("\t")
+ c = h.index(cell)
+ for line in f:
+ p = line.rstrip("\n").split("\t")
+ expr[p[0]] = float(p[c])
+ gene_scores = gene_scores_from_expression(expr, 1.0)
+ rxn_scores = score_reactions_from_genes(ref, gene_scores)
+ return ref, spont, custom, gene_scores, rxn_scores
+
+
+def _step0(prep, rxn_scores):
+ """The scores/flags for step 0 of series '1+1' (the cheap single-MILP probe)."""
+ step = get_init_steps("1+1")[0]
+ to_zero = prep.masks.ignored(step.ignore_mask)
+ scores = group_rxn_scores(prep.min_model, rxn_scores, prep.orig_rxn_ids,
+ prep.group_ids, to_zero)
+ return step, scores
+
+
+def _run_step0(min_model, scores, prep, step, **kw) -> Result:
+ t = time.time()
+ res = run_ftinit(min_model, scores, essential_rxns=set(prep.essential_rxns),
+ allow_excretion=step.allow_met_secr, rem_pos_rev=step.pos_rev_off,
+ ignore_mets=step.mets_to_ignore, **kw)
+ return Result(label="", seconds=time.time() - t, status="ok",
+ objective=res.objective, n_kept=len(res.on_reactions),
+ reactions=sorted(res.on_reactions))
+
+
+def _finalize(results: list[Result]) -> None:
+ """Fill rel_obj_gap / jaccard against the first result (the reference)."""
+ ref = results[0]
+ ref_set = set(ref.reactions)
+ for r in results:
+ r.rel_obj_gap = (ref.objective - r.objective) / abs(ref.objective) if ref.objective else 0.0
+ r.jaccard = _jaccard(set(r.reactions), ref_set)
+
+
+def _recommend(results: list[Result]) -> str:
+ """Cheapest config (after the reference) within both tolerances; '-' if none."""
+ ok = [r for r in results[1:]
+ if r.status == "ok" and abs(r.rel_obj_gap or 1) <= TOL_OBJ and (r.jaccard or 0) >= TOL_JAC]
+ return min(ok, key=lambda r: r.seconds).label if ok else "-"
+
+
+def _print_table(title: str, results: list[Result], note: str = "") -> list[str]:
+ lines = [f"### {title}", ""]
+ if note:
+ lines += [note, ""]
+ lines.append("| config | time (s) | status | objective | n_kept | rel.obj.gap | Jaccard vs ref |")
+ lines.append("|---|--:|---|--:|--:|--:|--:|")
+ for r in results:
+ gap = "ref" if r is results[0] else (f"{r.rel_obj_gap:+.4f}" if r.rel_obj_gap is not None else "")
+ jac = "ref" if r is results[0] else (f"{r.jaccard:.4f}" if r.jaccard is not None else "")
+ lines.append(f"| {r.label} | {r.seconds:.0f} | {r.status} | {r.objective:.1f} | "
+ f"{r.n_kept} | {gap} | {jac} |")
+ rec = _recommend(results)
+ lines += ["", f"Cheapest config within obj≤{TOL_OBJ:.1%} & Jaccard≥{TOL_JAC} of ref: **{rec}**", ""]
+ for ln in lines:
+ print(ln)
+ return lines
+
+
+# --------------------------------------------------------------------------- sweeps
+
+def sweep_ftinit_milp(prep, rxn_scores, store, save) -> list:
+ step, scores = _step0(prep, rxn_scores)
+ mm = prep.min_model
+ doc: list[str] = []
+
+ def cfg(label, **kw):
+ key = ("ftinit_milp", label)
+ if key not in store:
+ print(f"[ftinit_milp] {label} ...", flush=True)
+ r = _run_step0(mm, scores, prep, step, **kw)
+ r.label = label
+ store[key] = r
+ save()
+ return store[key]
+
+ # mip_gap sweep (big_m=100, force_on=0.1)
+ res = [cfg(f"gap={g}", mip_gap=g, big_m=100.0, force_on=0.1, time_limit=900) for g in MIP_GAPS]
+ _finalize(res)
+ doc += _print_table("ftINIT step-0: mip_gap (big_m=100, force_on=0.1)", res)
+
+ # big_m sweep (gap=0.001, force_on=0.1)
+ res = [cfg(f"big_m={int(b)}", mip_gap=0.001, big_m=b, force_on=0.1, time_limit=900) for b in BIG_MS]
+ _finalize(res)
+ doc += _print_table("ftINIT step-0: big_m (gap=0.001, force_on=0.1)", res,
+ "big_m caps a scored reaction's flux; large values weaken the LP relaxation.")
+
+ # force_on sweep (gap=0.001, big_m=100) — changes the model (connectivity threshold)
+ res = [cfg(f"force_on={fo}", mip_gap=0.001, big_m=100.0, force_on=fo, time_limit=900) for fo in FORCE_ONS]
+ _finalize(res)
+ doc += _print_table("ftINIT step-0: force_on (gap=0.001, big_m=100)", res,
+ "force_on changes the *model* (min flux to count as 'on'), not just tolerance — "
+ "Jaccard here measures sensitivity, not error.")
+ return doc
+
+
+def sweep_prep_scale(ref, spont, custom, rxn_scores, store, save) -> list:
+ doc: list[str] = []
+ # One unscaled prep; rescale copies of its min_model for each setting.
+ base = prep_init_model(ref, ext_comp="e", spontaneous=spont, custom=custom, scale=False)
+ step, scores = _step0(base, rxn_scores)
+
+ def cfg(label, msd):
+ key = ("prep_scale", label)
+ if key not in store:
+ print(f"[prep_scale] {label} ...", flush=True)
+ mm = base.min_model.copy()
+ if msd is not None:
+ rescale_for_init(mm, msd)
+ # group_rxn_scores keys are merged ids — identical across copies, so reuse `scores`.
+ t = time.time()
+ try:
+ r = _run_step0(mm, scores, base, step, mip_gap=0.001, big_m=100.0,
+ force_on=0.1, time_limit=600)
+ except Exception as ex: # noqa: BLE001 (infeasible/intractable is a finding)
+ r = Result(label=label, seconds=time.time() - t, status=f"FAIL:{type(ex).__name__}",
+ objective=0.0, n_kept=0)
+ r.label = label
+ store[key] = r
+ save()
+ return store[key]
+
+ res = [cfg("scale=on,msd=25", 25.0)] # reference = production default
+ res += [cfg(f"msd={int(m)}", m) for m in MAX_STOICH if m != 25.0]
+ res.append(cfg("scale=off", None))
+ _finalize(res)
+ doc += _print_table("prep scaling: rescaleModelForINIT max_stoich_diff (+scale off), big_m=100", res,
+ "With big_m=100 fixed, scale=off / poor conditioning is expected to be "
+ "infeasible or far slower — that is the reason scaling is on by default.")
+ return doc
+
+
+def sweep_tinit(ref, rxn_scores, store, save) -> list:
+ doc: list[str] = []
+ ess: list[str] = []
+
+ def cfg(label, **kw):
+ key = ("tinit", label)
+ if key not in store:
+ print(f"[tinit] {label} ...", flush=True)
+ t = time.time()
+ try:
+ out = get_init_model(ref, rxn_scores=rxn_scores, essential_rxns=ess, **kw)
+ r = Result(label=label, seconds=time.time() - t, status="ok",
+ objective=0.0, n_kept=len(out.model.reactions),
+ reactions=sorted(x.id for x in out.model.reactions))
+ except Exception as ex: # noqa: BLE001
+ r = Result(label=label, seconds=time.time() - t, status=f"FAIL:{type(ex).__name__}",
+ objective=0.0, n_kept=0)
+ store[key] = r
+ save()
+ return store[key]
+
+ tl = 400 # tight time limit so the sweep is affordable
+ res = [cfg(f"gap={g}", eps=1.0, prod_weight=0.5, mip_gap=g, time_limit=tl) for g in (0.001, 0.003, 0.01)]
+ _finalize(res)
+ doc += _print_table(f"tINIT: mip_gap (eps=1, prod_weight=0.5, time_limit={tl}s)", res)
+
+ res = [cfg(f"eps={e}", eps=e, prod_weight=0.5, mip_gap=0.005, time_limit=tl) for e in EPS_VALS]
+ _finalize(res)
+ doc += _print_table("tINIT: eps (gap=0.005) — connectivity flux threshold (changes the model)", res)
+
+ res = [cfg(f"prodw={p}", eps=1.0, prod_weight=p, mip_gap=0.005, time_limit=tl) for p in PROD_WEIGHTS]
+ _finalize(res)
+ doc += _print_table("tINIT: prod_weight (gap=0.005) — metabolite-production reward (changes the model)", res)
+
+ res = [cfg("big_m=ub(None)", eps=1.0, prod_weight=0.5, mip_gap=0.005, time_limit=tl, big_m=None)]
+ res += [cfg(f"big_m={int(b)}", eps=1.0, prod_weight=0.5, mip_gap=0.005, time_limit=tl, big_m=b)
+ for b in (1000.0, 250.0, 100.0)]
+ _finalize(res)
+ doc += _print_table("tINIT: big_m (gap=0.005) — None=per-reaction ub (no rescale on tINIT)", res)
+ return doc
+
+
+def sweep_ftinit_full(prep, rxn_scores, gene_scores, store, save) -> list:
+ doc: list[str] = []
+
+ def cfg(label, **kw):
+ key = ("ftinit_full", label)
+ if key not in store:
+ print(f"[ftinit_full] {label} ...", flush=True)
+ t = time.time()
+ try:
+ out = ftinit(prep, rxn_scores, gene_scores=gene_scores, series="1+1", **kw)
+ r = Result(label=label, seconds=time.time() - t, status="ok",
+ objective=0.0, n_kept=len(out.reactions),
+ reactions=sorted(x.id for x in out.reactions))
+ except Exception as ex: # noqa: BLE001
+ r = Result(label=label, seconds=time.time() - t, status=f"FAIL:{type(ex).__name__}",
+ objective=0.0, n_kept=0)
+ store[key] = r
+ save()
+ return store[key]
+
+ res = [cfg(f"gap={g}", mip_gap=g, time_limit=600) for g in (0.001, 0.003, 0.01)]
+ res += [cfg(f"big_m={int(b)}", mip_gap=0.003, big_m=b, time_limit=600) for b in (50.0, 250.0)]
+ _finalize(res)
+ doc += _print_table("ftINIT full pipeline ('1+1'): mip_gap & big_m — final model size/stability", res)
+ return doc
+
+
+def main() -> None:
+ ap = argparse.ArgumentParser(description=__doc__,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+ ap.add_argument("--work", type=Path, default=Path.home() / "hgem_compare")
+ ap.add_argument("--human-gem", type=Path, default=Path.home() / "github" / "Human-GEM")
+ ap.add_argument("--cell", default="HCT116")
+ ap.add_argument("--sweeps", default="ftinit_milp,prep_scale,tinit,ftinit_full",
+ help="comma-separated subset of: ftinit_milp,prep_scale,tinit,ftinit_full")
+ ap.add_argument("--out", type=Path, default=None, help="results pickle (resumable)")
+ ap.add_argument("--doc", type=Path, default=None, help="write the markdown tables here")
+ args = ap.parse_args()
+
+ out = args.out or args.work / f"init_param_sweep_{args.cell}.pkl"
+ store: dict = pickle.load(open(out, "rb")) if out.exists() else {}
+
+ def save():
+ tmp = Path(f"{out}.part")
+ pickle.dump(store, open(tmp, "wb"))
+ tmp.replace(out)
+
+ sweeps = set(args.sweeps.split(","))
+ t0 = time.time()
+ ref, spont, custom, gene_scores, rxn_scores = _load_inputs(args.work, args.human_gem, args.cell)
+ print(f"[{time.time()-t0:.0f}s] loaded {len(ref.reactions)} rxns, cell={args.cell}", flush=True)
+
+ prep = None
+ if sweeps & {"ftinit_milp", "ftinit_full"}:
+ prep = prep_init_model(ref, ext_comp="e", spontaneous=spont, custom=custom, scale=True)
+ print(f"[{time.time()-t0:.0f}s] scaled prep: min_model {len(prep.min_model.reactions)} rxns",
+ flush=True)
+
+ doc: list[str] = [f"# (f)tINIT parameter calibration — Human-GEM / {args.cell}", "",
+ "Generated by `scripts/analyze_init_params.py`. Reference (first) row of each "
+ "tolerance sweep is the tightest setting; gaps/Jaccard are measured against it.", ""]
+ if "ftinit_milp" in sweeps:
+ doc += sweep_ftinit_milp(prep, rxn_scores, store, save)
+ if "prep_scale" in sweeps:
+ doc += sweep_prep_scale(ref, spont, custom, rxn_scores, store, save)
+ if "tinit" in sweeps:
+ doc += sweep_tinit(ref, rxn_scores, store, save)
+ if "ftinit_full" in sweeps:
+ doc += sweep_ftinit_full(prep, rxn_scores, gene_scores, store, save)
+
+ if args.doc:
+ args.doc.write_text("\n".join(doc) + "\n")
+ print(f"\nwrote {args.doc}", flush=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/analyze_init_robustness.py b/scripts/analyze_init_robustness.py
new file mode 100644
index 0000000..1cac79a
--- /dev/null
+++ b/scripts/analyze_init_robustness.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+"""Robustness of (f)tINIT to degraded transcriptomics input (Phase 4d.7).
+
+The metabolic-task layer is *always part of the pipeline* — it is what makes the output a
+functional model. The experimental variable here is therefore the **transcriptomics
+input**, not whether tasks are used. This script holds the task + gap-fill layer fixed and
+asks: as the expression data gets noisier or sparser, (a) does the model stay functional,
+and (b) how much does the *reaction content* drift from what clean data would give — and
+which parameters keep it stable?
+
+Metrics, per run (tasks always on):
+
+* ``frac`` — fraction of essential metabolic tasks the model performs (``check_tasks``).
+ The task+gap-fill layer should hold this at 1.0; a drop is a real failure.
+* ``Jaccard`` — reaction-set overlap with the **clean-data** model. This is the real cost
+ of bad input: even when all tasks still pass, degraded data changes *which*
+ reactions are kept. The primary robustness signal.
+* ``n_rxns`` — model size (does degraded data bloat or shrink it).
+
+Three independent degradations of the gene-expression vector (severity = higher is worse):
+
+* ``dropout`` — set a random fraction of genes to 0 (→ gene score -5, a strong *remove*
+ signal). Simulates shallow sequencing / single-cell dropout.
+* ``noise`` — multiply each level by ``exp(N(0, sigma))`` (sigma = severity).
+* ``downsample`` — drop a random fraction of genes entirely (→ ``no_gene_score``).
+
+Two phases:
+
+* **gradient** — task pipeline across degradation levels; shows functional integrity and
+ reaction-set drift vs the clean-data model.
+* **levers** — at a fixed severe degradation, vary the robustness parameters
+ (``no_gene_score``, ``force_on``; ``prod_weight``/``eps`` for tINIT) to see which keeps
+ the model closest to the clean-data result / most functional.
+
+``--algo ftinit`` (default) or ``tinit``. Resumable; reuses the cached Human-GEM task prep
+(``rg_prep_tasks.pkl``). Loose MIP gap for speed (functionality + set overlap, not the
+exact optimum, are the metrics).
+
+Usage
+-----
+ python scripts/analyze_init_robustness.py --algo ftinit --cell HCT116
+"""
+from __future__ import annotations
+
+import argparse
+import pickle
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import cobra
+import numpy as np
+
+from raven_python.init import (
+ ftinit,
+ gene_scores_from_expression,
+ get_init_model,
+ score_reactions_from_genes,
+)
+from raven_python.tasks import check_tasks, parse_task_list
+
+# Degradation grid (severity per kind). A mild and a severe point per kind.
+GRADIENT = {
+ "dropout": (0.5, 0.7), # moderate + severe-but-realistic (single-cell dropout ~50-70%);
+ "noise": (1.0, 2.0), # 90%+ dropout breaks ~all tasks so gap-fill rebuilds the model
+ "downsample": (0.5, 0.7), # (a per-task MILP each) — pathologically slow and unrealistic.
+}
+LEVER_KIND, LEVER_LEVEL = "dropout", 0.7 # severe-but-tractable point for the levers
+NO_GENE_SCORES = (-1.0, -0.5) # vs the default -2 (the gradient row)
+FORCE_ONS = (0.2,) # vs the default 0.1
+PROD_WEIGHTS = (0.0, 1.0, 2.0) # tINIT only (default 0.5)
+EPS_VALS = (0.5, 1.0) # tINIT only (gradient default 0.1; test higher)
+
+# Loose solver tolerances (speed; functionality + set overlap, not the exact optimum).
+MIP_GAP, TIME_LIMIT = 0.02, 120.0
+
+
+@dataclass
+class Result:
+ label: str
+ seconds: float
+ status: str
+ n_rxns: int
+ n_pass: int
+ n_tasks: int
+ frac_pass: float
+ reactions: list[str] = field(default_factory=list)
+ jaccard_clean: float | None = None
+
+
+def _jaccard(a: set[str], b: set[str]) -> float:
+ return len(a & b) / len(a | b) if (a or b) else 1.0
+
+
+def degrade(expr: dict[str, float], kind: str, level: float, seed: int) -> dict[str, float]:
+ """Return a degraded copy of the expression dict (severity ``level``)."""
+ if level <= 0:
+ return dict(expr)
+ rng = np.random.default_rng(seed)
+ genes = list(expr)
+ if kind == "dropout":
+ out = dict(expr)
+ for g in rng.choice(genes, size=int(level * len(genes)), replace=False):
+ out[g] = 0.0
+ return out
+ if kind == "noise":
+ return {g: max(v * float(np.exp(rng.normal(0.0, level))), 0.0) for g, v in expr.items()}
+ if kind == "downsample":
+ keep = set(rng.choice(genes, size=int((1 - level) * len(genes)), replace=False))
+ return {g: v for g, v in expr.items() if g in keep}
+ raise ValueError(f"unknown degradation kind {kind!r}")
+
+
+def functionality(model: cobra.Model, tasks) -> tuple[int, int]:
+ """(passed, total) essential tasks the extracted model can perform."""
+ results = check_tasks(model, tasks)
+ return sum(t.passed for t in results), len(results)
+
+
+def _measure(label, builder, tasks, clean_set=None) -> Result:
+ t = time.time()
+ try:
+ model = builder()
+ n_pass, n_tasks = functionality(model, tasks)
+ rset = sorted(x.id for x in model.reactions)
+ r = Result(label, time.time() - t, "ok", len(rset), n_pass, n_tasks,
+ n_pass / n_tasks if n_tasks else 0.0, rset)
+ if clean_set is not None:
+ r.jaccard_clean = _jaccard(set(rset), clean_set)
+ except Exception as ex: # noqa: BLE001 (infeasible/failed build is itself a finding)
+ msg = str(ex)[:80].replace("\n", " ") or type(ex).__name__
+ print(f" FAIL {label}: {type(ex).__name__}: {ex}", flush=True)
+ r = Result(label, time.time() - t, f"FAIL:{msg}", 0, 0, len(tasks), 0.0)
+ return r
+
+
+def _table(title, results, note="") -> list[str]:
+ lines = [f"### {title}", ""]
+ if note:
+ lines += [note, ""]
+ lines.append("| config | time (s) | status | n_rxns | tasks passed | frac | Jaccard vs clean |")
+ lines.append("|---|--:|---|--:|--:|--:|--:|")
+ for r in results:
+ jac = f"{r.jaccard_clean:.3f}" if r.jaccard_clean is not None else "ref"
+ lines.append(f"| {r.label} | {r.seconds:.0f} | {r.status} | {r.n_rxns} | "
+ f"{r.n_pass}/{r.n_tasks} | {r.frac_pass:.3f} | {jac} |")
+ lines.append("")
+ for ln in lines:
+ print(ln)
+ return lines
+
+
+def main() -> None:
+ ap = argparse.ArgumentParser(description=__doc__,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+ ap.add_argument("--work", type=Path, default=Path.home() / "hgem_compare")
+ ap.add_argument("--human-gem", type=Path, default=Path.home() / "github" / "Human-GEM")
+ ap.add_argument("--cell", default="HCT116")
+ ap.add_argument("--algo", choices=("ftinit", "tinit"), default="ftinit")
+ ap.add_argument("--phase", default="gradient,levers")
+ ap.add_argument("--seed", type=int, default=0)
+ ap.add_argument("--out", type=Path, default=None)
+ ap.add_argument("--doc", type=Path, default=None)
+ args = ap.parse_args()
+
+ out = args.out or args.work / f"init_robustness_{args.algo}_{args.cell}.pkl"
+ store: dict = pickle.load(open(out, "rb")) if out.exists() else {}
+
+ def save():
+ tmp = Path(f"{out}.part")
+ pickle.dump(store, open(tmp, "wb"))
+ tmp.replace(out)
+
+ def cached(key, fn):
+ if key not in store:
+ print(f"[{args.algo}] {key[1]} ...", flush=True)
+ store[key] = fn()
+ save()
+ return store[key]
+
+ t0 = time.time()
+ ref = cobra.io.read_sbml_model(str(args.work / "raven_refModel.xml"))
+ ref.solver = cobra.Configuration().solver
+ expr: dict[str, float] = {}
+ with open(args.human_gem / "data" / "datasets" / "Hart2015_RNAseq.txt") as f:
+ h = f.readline().rstrip("\n").split("\t")
+ c = h.index(args.cell)
+ for line in f:
+ p = line.rstrip("\n").split("\t")
+ expr[p[0]] = float(p[c])
+ tasks = parse_task_list(str(args.human_gem / "data" / "metabolicTasks" /
+ "metabolicTasks_Essential.txt"))
+ prep = pickle.load(open(args.work / "rg_prep_tasks.pkl", "rb")) # ftINIT uses task layer
+ task_layer_note = ("task layer always on" if args.algo == "ftinit"
+ else "essential_rxns=[] (tINIT lb=eps incompatible with many essentials)")
+ print(f"[{time.time()-t0:.0f}s] ref {len(ref.reactions)} rxns, {len(tasks)} tasks, "
+ f"cell={args.cell}, algo={args.algo} ({task_layer_note})", flush=True)
+
+ def model_for(e, **kw):
+ g = gene_scores_from_expression(e, 1.0)
+ r = score_reactions_from_genes(ref, g, no_gene_score=kw.get("no_gene_score", -2.0))
+ if args.algo == "ftinit":
+ return ftinit(prep, r, gene_scores=g, series="1+1",
+ force_on=kw.get("force_on", 0.1), mip_gap=MIP_GAP, time_limit=TIME_LIMIT)
+ # tINIT's essential_rxns are forced via lb=eps; >100 essentials simultaneously is
+ # infeasible at genome scale regardless of eps (see docs/init_param_calibration.md
+ # §1.5). tINIT is therefore run *without* essentials here — the realistic
+ # tINIT-without-gap-fill picture. Use a small default eps (0.1) all the same to
+ # avoid the unrelated connectivity-threshold over-constraint.
+ return get_init_model(ref, rxn_scores=r, essential_rxns=[],
+ prod_weight=kw.get("prod_weight", 0.5), eps=kw.get("eps", 0.1),
+ mip_gap=MIP_GAP, time_limit=TIME_LIMIT).model
+
+ phases = set(args.phase.split(","))
+ doc = [f"# (f)tINIT robustness to degraded transcriptomics — Human-GEM / {args.cell} / {args.algo}",
+ "", "Task + gap-fill layer is always on (it is part of the pipeline); the variable is the "
+ "expression input. Functional = fraction of essential tasks performed (check_tasks); "
+ "Jaccard is reaction-set overlap with the clean-data model. Generated by "
+ "`scripts/analyze_init_robustness.py`.", ""]
+
+ clean = cached(("clean", "clean"), lambda: _measure("clean", lambda: model_for(expr), tasks))
+ clean_set = set(clean.reactions)
+ clean.jaccard_clean = None # it is the reference
+ doc += _table("Clean-data baseline", [clean])
+
+ if "gradient" in phases:
+ for kind, levels in GRADIENT.items():
+ rows = [clean]
+ for lvl in levels:
+ e = degrade(expr, kind, lvl, args.seed)
+ rows.append(cached((f"grad_{kind}", f"{kind}={lvl}"), lambda e=e, lvl=lvl, kind=kind:
+ _measure(f"{kind}={lvl}", lambda: model_for(e), tasks, clean_set)))
+ doc += _table(f"Gradient: {kind} (task pipeline always on)", rows,
+ "Higher severity = noisier/sparser input. frac should stay ~1.0 (the task "
+ "layer's job); the Jaccard drop is how much degraded data changes the model.")
+
+ if "levers" in phases:
+ e = degrade(expr, LEVER_KIND, LEVER_LEVEL, args.seed)
+ tag = f"{LEVER_KIND}={LEVER_LEVEL}"
+ rows = []
+ if args.algo == "ftinit":
+ for ngs in NO_GENE_SCORES:
+ rows.append(cached(("lever", f"no_gene_score={ngs}"), lambda ngs=ngs:
+ _measure(f"no_gene_score={ngs}", lambda: model_for(e, no_gene_score=ngs),
+ tasks, clean_set)))
+ for fo in FORCE_ONS:
+ rows.append(cached(("lever", f"force_on={fo}"), lambda fo=fo:
+ _measure(f"force_on={fo}", lambda: model_for(e, force_on=fo),
+ tasks, clean_set)))
+ else:
+ for pw in PROD_WEIGHTS:
+ rows.append(cached(("lever", f"prod_weight={pw}"), lambda pw=pw:
+ _measure(f"prod_weight={pw}", lambda: model_for(e, prod_weight=pw),
+ tasks, clean_set)))
+ for ev in EPS_VALS:
+ rows.append(cached(("lever", f"eps={ev}"), lambda ev=ev:
+ _measure(f"eps={ev}", lambda: model_for(e, eps=ev), tasks, clean_set)))
+ doc += _table(f"Levers at {tag}: which parameter keeps the model closest to clean?", rows,
+ "Compare against the default-parameter row for this severity in the gradient "
+ "table above (no_gene_score=-2, force_on=0.1 / prod_weight=0.5, eps=1.0).")
+
+ if args.doc:
+ args.doc.write_text("\n".join(doc) + "\n")
+ print(f"\nwrote {args.doc}", flush=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/analyze_init_solvers.py b/scripts/analyze_init_solvers.py
new file mode 100644
index 0000000..e1e7b6c
--- /dev/null
+++ b/scripts/analyze_init_solvers.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""Cross-solver benchmark for ftINIT on a genome-scale model (Phase 4d.7).
+
+The clean-data calibration and robustness studies tuned (and ran) on Gurobi. The CI
+``tests/test_init_solvers.py`` checks correctness on toy models for every installed MILP
+solver; this script measures **genome-scale tractability and reaction-set agreement** —
+does the same ftINIT pipeline that works in seconds on Gurobi also complete on HiGHS or
+GLPK, in what time, and producing the same model?
+
+For each installed MILP-capable optlang interface (Gurobi, ``hybrid`` for HiGHS, GLPK) it
+runs the *same* ftINIT call (cached Human-GEM no-task prep + HCT116 scores) with the same
+``mip_gap``/``time_limit``, records (status, wall time, reaction set), and computes the
+pairwise Jaccard of the resulting reaction sets. Solvers that fail (the optlang
+``hybrid_interface`` ``clone`` bug, or GLPK timing out at genome scale) are recorded as
+such — that *is* the cross-solver picture.
+
+Usage
+-----
+ python scripts/analyze_init_solvers.py --cell HCT116 --time-limit 900 \
+ --doc docs/init_solver_benchmark.md
+"""
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import pickle
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import cobra
+
+from raven_python.init import ftinit, gene_scores_from_expression, score_reactions_from_genes
+
+_INTERFACES = {"gurobi": "gurobi_interface", "hybrid": "hybrid_interface", "glpk": "glpk_interface"}
+
+
+def _available_solvers() -> list[str]:
+ return [name for name, mod in _INTERFACES.items()
+ if importlib.util.find_spec(f"optlang.{mod}") is not None]
+
+
+@dataclass
+class Result:
+ solver: str
+ seconds: float
+ status: str
+ n_rxns: int
+ reactions: list[str] = field(default_factory=list)
+
+
+def _jaccard(a: set[str], b: set[str]) -> float:
+ return len(a & b) / len(a | b) if (a or b) else 1.0
+
+
+def main() -> None:
+ ap = argparse.ArgumentParser(description=__doc__,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+ ap.add_argument("--work", type=Path, default=Path.home() / "hgem_compare")
+ ap.add_argument("--human-gem", type=Path, default=Path.home() / "github" / "Human-GEM")
+ ap.add_argument("--cell", default="HCT116")
+ ap.add_argument("--mip-gap", type=float, default=0.001)
+ ap.add_argument("--time-limit", type=float, default=900.0)
+ ap.add_argument("--out", type=Path, default=None)
+ ap.add_argument("--doc", type=Path, default=None)
+ args = ap.parse_args()
+
+ out = args.out or args.work / f"init_solver_bench_{args.cell}.pkl"
+ store: dict = pickle.load(open(out, "rb")) if out.exists() else {}
+
+ def save():
+ tmp = Path(f"{out}.part")
+ pickle.dump(store, open(tmp, "wb"))
+ tmp.replace(out)
+
+ expr: dict[str, float] = {}
+ with open(args.human_gem / "data" / "datasets" / "Hart2015_RNAseq.txt") as f:
+ h = f.readline().rstrip("\n").split("\t")
+ c = h.index(args.cell)
+ for line in f:
+ p = line.rstrip("\n").split("\t")
+ expr[p[0]] = float(p[c])
+ if not (args.work / "rg_prep.pkl").exists():
+ raise SystemExit(f"missing prep at {args.work / 'rg_prep.pkl'} — run the validation first")
+
+ solvers = _available_solvers()
+ print(f"available MILP solvers: {solvers}", flush=True)
+
+ def run(solver: str) -> Result:
+ if solver in store:
+ print(f"[{solver}] cached, skip", flush=True)
+ return store[solver]
+ print(f"[{solver}] running ...", flush=True)
+ t = time.time()
+ try:
+ # Fresh ref + prep load per solver so a broken interface (e.g. the optlang
+ # hybrid_interface clone bug at .solver=) doesn't pollute the next solver's state.
+ ref = cobra.io.read_sbml_model(str(args.work / "raven_refModel.xml"))
+ ref.solver = solver
+ local_prep = pickle.load(open(args.work / "rg_prep.pkl", "rb"))
+ local_prep.min_model.solver = solver
+ g = gene_scores_from_expression(expr, 1.0)
+ r = score_reactions_from_genes(ref, g)
+ model = ftinit(local_prep, r, gene_scores=g, series="1+1",
+ mip_gap=args.mip_gap, time_limit=args.time_limit)
+ rset = sorted(x.id for x in model.reactions)
+ res = Result(solver, time.time() - t, "ok", len(rset), rset)
+ except Exception as ex: # noqa: BLE001 - failure mode is the finding
+ res = Result(solver, time.time() - t,
+ f"FAIL:{type(ex).__name__}: {str(ex)[:80]}", 0, [])
+ store[solver] = res
+ save()
+ return res
+
+ results: dict[str, Result] = {s: run(s) for s in solvers}
+
+ # Reporting.
+ lines = [f"# Cross-solver ftINIT benchmark — Human-GEM / {args.cell}", "",
+ f"Same `ftinit()` call (no-task scaled prep; `mip_gap={args.mip_gap}`, "
+ f"`time_limit={args.time_limit}s`) run with each installed MILP-capable "
+ f"optlang interface. Generated by `scripts/analyze_init_solvers.py`.", "",
+ "## Per-solver result", "",
+ "| solver | time (s) | status | n_rxns |",
+ "|--------|---------:|--------|-------:|"]
+ for s, r in results.items():
+ lines.append(f"| {s} | {r.seconds:.0f} | {r.status} | {r.n_rxns} |")
+ lines.append("")
+
+ ok = {s: r for s, r in results.items() if r.status == "ok" and r.reactions}
+ if len(ok) >= 2:
+ lines += ["## Reaction-set agreement (Jaccard)", "",
+ "| solvers | shared | only A | only B | Jaccard |",
+ "|---------|-------:|-------:|-------:|--------:|"]
+ names = sorted(ok)
+ for i, a in enumerate(names):
+ for b in names[i + 1:]:
+ sa, sb = set(ok[a].reactions), set(ok[b].reactions)
+ lines.append(f"| {a} vs {b} | {len(sa & sb)} | {len(sa - sb)} | "
+ f"{len(sb - sa)} | {_jaccard(sa, sb):.3f} |")
+ lines.append("")
+
+ text = "\n".join(lines) + "\n"
+ print(text)
+ if args.doc:
+ args.doc.write_text(text)
+ print(f"wrote {args.doc}", flush=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/build_kegg_artefacts.py b/scripts/build_kegg_artefacts.py
new file mode 100644
index 0000000..13fd00e
--- /dev/null
+++ b/scripts/build_kegg_artefacts.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+"""Build the publishable KEGG artefact set for one release (maintainer-side).
+
+Runs the maintainer pipeline against an arranged KEGG dump (see
+``download_kegg_dump`` / ``fetch_keggdb``):
+
+* 3b.2 — ``parse_kegg_dump`` → ``reference_model.yml.gz`` + the gzipped-TSV tables;
+* 3b.3 — ``build_hmm_library`` per domain → a pressed ``.hmm`` (+ hmmpress
+ sidecars), named so :func:`raven_python.data.ensure_kegg_hmm_library` can fetch them.
+
+Everything lands in ``--out`` ready to upload as release assets; feed that
+directory to ``scripts/make_registry_snippet.py data`` to emit the registry entry.
+
+Examples
+--------
+Tables + reference model only (fast, no binaries)::
+
+ python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts
+
+Full build incl. both HMM libraries (slow; needs HMMER/MAFFT/CD-HIT)::
+
+ python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts \\
+ --hmms --threads 8
+"""
+from __future__ import annotations
+
+import argparse
+import shutil
+from pathlib import Path
+
+from raven_python.reconstruction.kegg import (
+ build_hmm_library,
+ parse_kegg_dump,
+ read_kegg_table,
+)
+
+# hmmpress sidecar extensions, alongside the .hmm.
+_HMM_SIDECARS = (".h3f", ".h3i", ".h3m", ".h3p")
+
+
+def _publish_library(work: dict, out_dir: Path, domain: str) -> Path:
+ """Copy a built ``library.hmm`` (+ sidecars) to ``out_dir/.hmm``."""
+ library = work["library"]
+ if library is None:
+ raise SystemExit(f"No HMMs built for {domain!r}; nothing to publish.")
+ target = out_dir / f"{domain}.hmm"
+ shutil.copyfile(library, target)
+ for suffix in _HMM_SIDECARS:
+ sidecar = library.with_name(library.name + suffix)
+ if sidecar.exists():
+ shutil.copyfile(sidecar, target.with_name(target.name + suffix))
+ return target
+
+
+def main(argv: list[str] | None = None) -> None:
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.add_argument("--keggdb", required=True, type=Path, help="arranged KEGG dump directory")
+ parser.add_argument("--out", required=True, type=Path, help="artefact output directory")
+ parser.add_argument("--hmms", action="store_true", help="also build the HMM libraries")
+ parser.add_argument(
+ "--domains", nargs="+", default=["prokaryotes", "eukaryotes"], help="HMM domains to build"
+ )
+ parser.add_argument("--threads", type=int, default=1)
+ parser.add_argument("--seq-identity", type=float, default=0.9, help="CD-HIT identity (-1 skips)")
+ parser.add_argument(
+ "--parttree-residues", type=int, default=None,
+ help="total-residue budget above which MAFFT uses PartTree (default 1M, tuned "
+ "for ~7 GB RAM; raise on machines with more memory)",
+ )
+ args = parser.parse_args(argv)
+
+ args.out.mkdir(parents=True, exist_ok=True)
+ print(">>> Parsing KEGG dump (3b.2)...")
+ paths = parse_kegg_dump(args.keggdb, args.out)
+ for name, path in paths.items():
+ print(f" {name}: {path}")
+
+ if args.hmms:
+ ogk = read_kegg_table(paths["organism_gene_ko"])
+ genes_pep = args.keggdb / "genes.pep"
+ taxonomy = args.keggdb / "taxonomy"
+ for domain in args.domains:
+ print(f">>> Building HMM library for {domain} (3b.3)...")
+ work = build_hmm_library(
+ ogk, genes_pep, taxonomy, args.out / f"_hmms-{domain}",
+ domain=domain, seq_identity=args.seq_identity,
+ parttree_residues=args.parttree_residues, threads=args.threads,
+ )
+ published = _publish_library(work, args.out, domain)
+ print(f" {domain}: {published} ({len(work['hmms'])} profiles)")
+
+ print(f"\n>>> Done. Upload the contents of {args.out} as release assets, then run:")
+ print(" python scripts/make_registry_snippet.py data --dataset kegg "
+ f"--version --dir {args.out} --base-url ")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/make_registry_snippet.py b/scripts/make_registry_snippet.py
new file mode 100644
index 0000000..3efa49e
--- /dev/null
+++ b/scripts/make_registry_snippet.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+"""Emit ready-to-paste registry entries for published artefacts / binary ZIPs.
+
+Computes the SHA256 of each file and prints the Python/JSON entry to merge into
+``raven_python.data._DATA_REGISTRY`` (data artefacts) or ``raven_python.binaries._REGISTRY``
+(binary bundles). Run once per release, after uploading the files to the release.
+
+Examples
+--------
+Data artefacts (KEGG reference model + tables + HMM libraries) for one release::
+
+ python scripts/make_registry_snippet.py data \\
+ --dataset kegg --version kegg116 --dir artefacts \\
+ --base-url https://github.com/ORG/raven_python/releases/download/kegg-data-kegg116
+
+Binary bundle (one ZIP per platform, named ``---.zip``)::
+
+ python scripts/make_registry_snippet.py binary \\
+ --bundle blast --version 2.16.0 --provides blastp makeblastdb --dir zips \\
+ --base-url https://github.com/ORG/raven_python/releases/download/blast-2.16.0
+
+The SHA256 helper is shared with the runtime resolvers (``raven_python.binaries``), so
+published checksums always match what ``ensure_data`` / ``ensure_binary`` verify.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from raven_python.binaries import _sha256
+
+
+def _files_in(directory: Path) -> list[Path]:
+ """Regular, non-hidden files in ``directory``, sorted by name."""
+ return sorted(p for p in directory.iterdir() if p.is_file() and not p.name.startswith("."))
+
+
+def data_entry(dataset: str, version: str, base_url: str, directory: Path) -> dict:
+ """Build the ``_DATA_REGISTRY[dataset]`` entry for every file in ``directory``."""
+ base = base_url.rstrip("/")
+ files = {
+ p.name: {"url": f"{base}/{p.name}", "sha256": _sha256(p)} for p in _files_in(directory)
+ }
+ if not files:
+ raise SystemExit(f"No files found in {directory}")
+ return {"version": version, "files": files}
+
+
+def binary_entry(
+ bundle: str, version: str, provides: list[str], base_url: str, directory: Path
+) -> dict:
+ """Build the ``_REGISTRY[bundle]`` entry from ``---.zip``."""
+ base = base_url.rstrip("/")
+ prefix = f"{bundle}-{version}-"
+ platforms = {}
+ for zip_path in directory.glob(f"{prefix}*.zip"):
+ platform = zip_path.name[len(prefix) : -len(".zip")]
+ platforms[platform] = {"url": f"{base}/{zip_path.name}", "sha256": _sha256(zip_path)}
+ if not platforms:
+ raise SystemExit(f"No {prefix}*.zip files found in {directory}")
+ return {"version": version, "provides": provides, "platforms": dict(sorted(platforms.items()))}
+
+
+def render(key: str, entry: dict) -> str:
+ """Render ``{key: entry}`` as an indented JSON block (valid Python to paste)."""
+ return json.dumps({key: entry}, indent=4)
+
+
+def main(argv: list[str] | None = None) -> None:
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ sub = parser.add_subparsers(dest="kind", required=True)
+
+ d = sub.add_parser("data", help="data-artefact registry entry (raven_python.data)")
+ d.add_argument("--dataset", required=True, help="dataset key, e.g. 'kegg'")
+ d.add_argument("--version", required=True)
+ d.add_argument("--dir", required=True, type=Path, help="directory of uploaded artefacts")
+ d.add_argument("--base-url", required=True, help="release download URL prefix")
+
+ b = sub.add_parser("binary", help="binary-bundle registry entry (raven_python.binaries)")
+ b.add_argument("--bundle", required=True, help="bundle key, e.g. 'blast'")
+ b.add_argument("--version", required=True)
+ b.add_argument("--provides", nargs="+", required=True, help="executables the bundle provides")
+ b.add_argument("--dir", required=True, type=Path, help="directory of uploaded ZIPs")
+ b.add_argument("--base-url", required=True, help="release download URL prefix")
+
+ args = parser.parse_args(argv)
+ if args.kind == "data":
+ key, entry = args.dataset, data_entry(args.dataset, args.version, args.base_url, args.dir)
+ target = "raven_python/data.py _DATA_REGISTRY"
+ else:
+ key = args.bundle
+ entry = binary_entry(args.bundle, args.version, args.provides, args.base_url, args.dir)
+ target = "raven_python/binaries.py _REGISTRY"
+
+ print(f"# Merge into {target}:", file=sys.stderr)
+ print(render(key, entry))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/raven_python/__init__.py b/src/raven_python/__init__.py
new file mode 100644
index 0000000..591c4c1
--- /dev/null
+++ b/src/raven_python/__init__.py
@@ -0,0 +1,10 @@
+"""raven_python — Python counterpart of the RAVEN Toolbox, built on cobrapy.
+
+raven_python reuses cobrapy for simulation, standard analyses, SBML I/O, and model
+manipulation, and provides the RAVEN-specific functionality on top: de novo
+reconstruction (KEGG / homology), context-specific modeling (tINIT / ftINIT),
+metabolic task validation, connectivity gap-filling, omics integration (HPA),
+sub-cellular localisation, N-model comparison, and the RAVEN-style I/O formats.
+"""
+
+__version__ = "0.0.1"
diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py
new file mode 100644
index 0000000..2d4b5a2
--- /dev/null
+++ b/src/raven_python/binaries.py
@@ -0,0 +1,148 @@
+"""Locate and provision external command-line binaries (BLAST+, DIAMOND, …).
+
+Shared across tools (not homology-specific). Resolution order for any executable:
+
+ explicit path arg → env var (RAVEN_PYTHON_) → shutil.which (PATH)
+ → ensure_binary (download the version-pinned ZIP from a raven_python release,
+ verify SHA256, cache, return the path)
+ → FileNotFoundError with install guidance
+
+So a pre-installed/conda binary always wins; the bundled ZIP is the zero-setup
+fallback. See docs/maintaining_binaries.md for how the release ZIPs and the
+registry are produced and updated.
+"""
+from __future__ import annotations
+
+import hashlib
+import os
+import platform
+import shutil
+import zipfile
+from pathlib import Path
+from urllib.request import urlopen
+
+# Registry of bundled binaries. Empty until release ZIPs are published; populated
+# per docs/maintaining_binaries.md. Keyed by *bundle*; one bundle can provide
+# several executables (e.g. "blast" -> blastp + makeblastdb).
+# bundle -> {version, provides:[exe...], platforms:{"-": {url, sha256}}}
+_REGISTRY: dict = {}
+
+# Environment variable overrides per executable.
+_ENV_VARS = {
+ "diamond": "RAVEN_PYTHON_DIAMOND",
+ "blastp": "RAVEN_PYTHON_BLASTP",
+ "makeblastdb": "RAVEN_PYTHON_MAKEBLASTDB",
+ "hmmbuild": "RAVEN_PYTHON_HMMBUILD",
+ "hmmpress": "RAVEN_PYTHON_HMMPRESS",
+ "hmmsearch": "RAVEN_PYTHON_HMMSEARCH",
+ "hmmscan": "RAVEN_PYTHON_HMMSCAN",
+ "mafft": "RAVEN_PYTHON_MAFFT",
+ "cd-hit": "RAVEN_PYTHON_CDHIT",
+}
+
+
+def platform_key() -> str:
+ """Return the ``-`` key used in the registry (e.g. ``linux-x86_64``)."""
+ system = {"linux": "linux", "darwin": "macos", "windows": "windows"}.get(
+ platform.system().lower(), platform.system().lower()
+ )
+ machine = platform.machine().lower()
+ arch = {"x86_64": "x86_64", "amd64": "x86_64", "arm64": "arm64", "aarch64": "arm64"}.get(
+ machine, machine
+ )
+ return f"{system}-{arch}"
+
+
+def _cache_dir() -> Path:
+ base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")
+ return Path(base) / "raven_python" / "binaries"
+
+
+def _bundle_for(executable: str, registry: dict):
+ for name, bundle in registry.items():
+ if executable in bundle.get("provides", []):
+ return name, bundle
+ return None, None
+
+
+def _sha256(path: Path) -> str:
+ h = hashlib.sha256()
+ with open(path, "rb") as fh:
+ for chunk in iter(lambda: fh.read(1 << 20), b""):
+ h.update(chunk)
+ return h.hexdigest()
+
+
+def ensure_binary(executable: str, *, registry: dict | None = None) -> Path:
+ """Download (if needed) and return the path to a bundled ``executable``.
+
+ Consults the registry for the current platform, downloads the pinned ZIP,
+ verifies its SHA256, extracts it into the cache, and returns the executable
+ path. Raises ``FileNotFoundError`` if no bundle for this platform is hosted.
+ """
+ registry = _REGISTRY if registry is None else registry
+ bundle_name, bundle = _bundle_for(executable, registry)
+ if bundle is None:
+ raise FileNotFoundError(
+ f"No bundled binary registered for {executable!r}. Install it (e.g. "
+ f"`conda install -c bioconda {executable}`) or pass an explicit path."
+ )
+ key = platform_key()
+ entry = bundle.get("platforms", {}).get(key)
+ if entry is None:
+ raise FileNotFoundError(
+ f"No bundled {executable!r} for platform {key!r}. Install it "
+ f"(e.g. `conda install -c bioconda {executable}`), set "
+ f"{_ENV_VARS.get(executable, 'the binary path')}, or pass binary=."
+ )
+
+ dest_dir = _cache_dir() / f"{bundle_name}-{bundle['version']}-{key}"
+ exe = dest_dir / executable
+ if exe.exists():
+ return exe
+
+ dest_dir.mkdir(parents=True, exist_ok=True)
+ archive = dest_dir / "_download.zip"
+ # Download into a sibling .part file and rename on success — an interrupted
+ # download leaves the partial behind .part, never as a half-complete .zip
+ # that a later run might mistake for a finished one. Mirrors data.py.
+ part = archive.with_suffix(archive.suffix + ".part")
+ try:
+ with urlopen(entry["url"]) as resp, open(part, "wb") as out: # noqa: S310
+ shutil.copyfileobj(resp, out)
+ digest = _sha256(part)
+ if digest != entry["sha256"]:
+ raise ValueError(
+ f"SHA256 mismatch for {executable!r} ({key}): "
+ f"expected {entry['sha256']}, got {digest}."
+ )
+ os.replace(part, archive)
+ finally:
+ part.unlink(missing_ok=True)
+ with zipfile.ZipFile(archive) as zf:
+ zf.extractall(dest_dir)
+ archive.unlink(missing_ok=True)
+ if not exe.exists():
+ raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.")
+ exe.chmod(0o755)
+ return exe
+
+
+def resolve_binary(executable: str, *, binary: str | os.PathLike | None = None) -> str:
+ """Resolve an executable to a path: arg → env var → PATH → bundled ZIP → error."""
+ if binary is not None:
+ return os.fspath(binary)
+ env_var = _ENV_VARS.get(executable)
+ if env_var and os.environ.get(env_var):
+ return os.environ[env_var]
+ found = shutil.which(executable)
+ if found:
+ return found
+ try:
+ return os.fspath(ensure_binary(executable))
+ except FileNotFoundError as exc:
+ raise FileNotFoundError(
+ f"Could not find {executable!r}. Install it (e.g. "
+ f"`conda install -c bioconda {executable}`), put it on PATH, set "
+ f"{env_var or 'the binary path'}, or pass binary=. ({exc})"
+ ) from exc
diff --git a/src/raven_python/data.py b/src/raven_python/data.py
new file mode 100644
index 0000000..b1264be
--- /dev/null
+++ b/src/raven_python/data.py
@@ -0,0 +1,135 @@
+"""Fetch and cache published data artefacts (KEGG reference model, tables, HMMs).
+
+The mirror of :mod:`raven_python.binaries` for *data*: a version-pinned registry of
+downloadable artefacts, fetched on first use, SHA256-verified, and cached under
+platformdirs so end users never rebuild them from a KEGG dump (that is the
+maintainer's job — see docs/maintaining_kegg_data.md).
+
+Resolution for any artefact file:
+
+ explicit local dir → cached copy → download from the registry (verify,
+ cache) → FileNotFoundError with guidance
+
+The registry is **empty until the artefacts are published** (same as
+``binaries._REGISTRY``); until then ``ensure_data_file`` raises an actionable
+error. Cache layout::
+
+ $XDG_CACHE_HOME/raven_python/data/-/
+ (or ~/.cache/raven_python/data/... if XDG_CACHE_HOME is unset)
+"""
+from __future__ import annotations
+
+import os
+import shutil
+from pathlib import Path
+from urllib.request import urlopen
+
+from raven_python.binaries import _sha256
+
+# dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}}
+# Populated when raven_python publishes the KEGG artefacts as release assets.
+_DATA_REGISTRY: dict = {}
+
+# The core KEGG artefacts needed to build a model (no HMM libraries).
+CORE_KEGG_FILES = (
+ "reference_model.yml.gz",
+ "ko_reaction.tsv.gz",
+ "ko_names.tsv.gz",
+ "organism_gene_ko.tsv.xz",
+ "rxn_flags.tsv.gz",
+)
+
+
+def _data_cache_dir() -> Path:
+ base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")
+ return Path(base) / "raven_python" / "data"
+
+
+def _bundle(dataset: str, registry: dict) -> dict:
+ bundle = registry.get(dataset)
+ if bundle is None:
+ raise FileNotFoundError(
+ f"No data artefacts registered for {dataset!r}. Either pass a local "
+ f"directory of artefacts, or build them per docs/maintaining_kegg_data.md."
+ )
+ return bundle
+
+
+def ensure_data_file(
+ dataset: str,
+ filename: str,
+ *,
+ version: str | None = None,
+ registry: dict | None = None,
+) -> Path:
+ """Download (if needed) and return the cached path to one artefact file.
+
+ Looks the file up in the registry for ``dataset`` (at ``version`` or the
+ registry's default), downloads it to the version-pinned cache directory,
+ verifies its SHA256, and returns the path. Re-uses an already-cached copy.
+ """
+ registry = _DATA_REGISTRY if registry is None else registry
+ bundle = _bundle(dataset, registry)
+ ver = version or bundle["version"]
+ entry = bundle.get("files", {}).get(filename)
+ if entry is None:
+ raise FileNotFoundError(
+ f"{filename!r} is not registered for {dataset!r} {ver}. "
+ f"Available: {sorted(bundle.get('files', {}))}."
+ )
+
+ dest_dir = _data_cache_dir() / f"{dataset}-{ver}"
+ dest = dest_dir / filename
+ if dest.exists():
+ return dest
+
+ dest_dir.mkdir(parents=True, exist_ok=True)
+ tmp = dest.with_name(dest.name + ".part")
+ with urlopen(entry["url"]) as resp, open(tmp, "wb") as out: # noqa: S310 (trusted registry URLs)
+ shutil.copyfileobj(resp, out)
+ digest = _sha256(tmp)
+ if digest != entry["sha256"]:
+ tmp.unlink(missing_ok=True)
+ raise ValueError(
+ f"SHA256 mismatch for {dataset}/{filename} ({ver}): "
+ f"expected {entry['sha256']}, got {digest}."
+ )
+ tmp.replace(dest)
+ return dest
+
+
+def ensure_kegg_data(
+ *,
+ version: str | None = None,
+ files: tuple[str, ...] = CORE_KEGG_FILES,
+ registry: dict | None = None,
+) -> Path:
+ """Ensure the core KEGG artefacts are cached; return their directory.
+
+ Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg``
+ dataset and returns the cache directory holding them — ready to pass as the
+ ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`.
+ """
+ registry = _DATA_REGISTRY if registry is None else registry
+ ver = version or _bundle("kegg", registry)["version"]
+ for filename in files:
+ ensure_data_file("kegg", filename, version=ver, registry=registry)
+ return _data_cache_dir() / f"kegg-{ver}"
+
+
+def ensure_kegg_hmm_library(
+ domain: str, *, version: str | None = None, registry: dict | None = None
+) -> Path:
+ """Ensure a domain HMM library (and its hmmpress index) is cached; return its path.
+
+ ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``.hmm``
+ plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the
+ path to the ``.hmm`` (the argument for :func:`run_hmmscan`).
+ """
+ registry = _DATA_REGISTRY if registry is None else registry
+ ver = version or _bundle("kegg", registry)["version"]
+ base = f"{domain}.hmm"
+ library = ensure_data_file("kegg", base, version=ver, registry=registry)
+ for suffix in (".h3f", ".h3i", ".h3m", ".h3p"):
+ ensure_data_file("kegg", base + suffix, version=ver, registry=registry)
+ return library
diff --git a/src/raven_python/gapfilling/__init__.py b/src/raven_python/gapfilling/__init__.py
new file mode 100644
index 0000000..747b293
--- /dev/null
+++ b/src/raven_python/gapfilling/__init__.py
@@ -0,0 +1,9 @@
+"""Connectivity gap-filling against template models.
+
+:func:`connect_blocked_reactions` adds the fewest (lowest-penalty) template reactions so
+reactions blocked in a draft can carry flux. For the other gap-fill flavour (fill until
+the objective is feasible) use ``cobra.flux_analysis.gapfill``.
+"""
+from raven_python.gapfilling.fill import GapFillResult, connect_blocked_reactions
+
+__all__ = ["GapFillResult", "connect_blocked_reactions"]
diff --git a/src/raven_python/gapfilling/fill.py b/src/raven_python/gapfilling/fill.py
new file mode 100644
index 0000000..ba3418d
--- /dev/null
+++ b/src/raven_python/gapfilling/fill.py
@@ -0,0 +1,172 @@
+"""Connectivity gap-filling: add the fewest template reactions so reactions that are
+*blocked* in a draft can carry flux.
+
+For the other gap-filling flavour (add the fewest template reactions until the model's
+own objective becomes feasible) use ``cobra.flux_analysis.gapfill`` — just align the
+template's metabolite ids to the draft first, since cobra matches by id.
+
+It solves an MILP: pick the minimum-penalty subset of template reactions such that the
+blocked (irreversible) draft reactions can carry flux at steady state. Template
+metabolites are matched to the draft by ``name[compartment]`` (via
+:func:`add_reactions_from_model`), so templates in a different identifier namespace
+than the model still work. Per-reaction ``scores`` (higher = prefer to include) map to
+RAVEN's ``rxnScores``; the MILP minimises the penalty ``-score`` (default penalty
+``1.0``, i.e. minimise the number of reactions added).
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+import cobra
+from cobra.flux_analysis import find_blocked_reactions, flux_variability_analysis
+
+from raven_python.manipulation.transfer import add_reactions_from_model
+
+
+@dataclass
+class GapFillResult:
+ """Outcome of a connectivity gap-fill.
+
+ ``added_reactions`` are the template reaction ids added to ``model``;
+ ``newly_connected`` are draft reactions that were blocked but can now carry flux;
+ ``cannot_connect`` are blocked reactions left unconnectable.
+ """
+
+ added_reactions: list[str]
+ newly_connected: list[str]
+ cannot_connect: list[str]
+ model: cobra.Model
+
+
+def _as_models(templates: cobra.Model | Iterable[cobra.Model]) -> list[cobra.Model]:
+ return [templates] if isinstance(templates, cobra.Model) else list(templates)
+
+
+def _merge_templates(model: cobra.Model, templates: list[cobra.Model]) -> tuple[cobra.Model, list[str]]:
+ """Copy every template reaction (new ones only) into a working copy of ``model``.
+
+ Returns the working model and the ids of the reactions that came from templates
+ (the gap-fill candidates). Metabolites are matched by ``name[compartment]``.
+ """
+ working = model.copy()
+ template_ids: list[str] = []
+ for template in templates:
+ new = [r.id for r in template.reactions if r.id not in working.reactions]
+ if new:
+ added = add_reactions_from_model(working, template, new, genes=False, note=None)
+ template_ids += [r.id for r in added]
+ return working, template_ids
+
+
+def _solve_min_templates(
+ working: cobra.Model,
+ template_ids: list[str],
+ *,
+ scores: dict[str, float] | None,
+ penalty: float,
+ allow_net_production: bool,
+) -> set[str] | None:
+ """MILP: minimum-penalty template reactions making ``working`` feasible.
+
+ The requirement (here, forced flux through the blocked reactions) must already be
+ imposed on ``working``. Returns the template reaction ids to keep, or ``None`` if
+ the problem is infeasible.
+ """
+ prob = working.problem
+ indicators: dict[str, object] = {}
+ extra = []
+ for rid in template_ids:
+ rxn = working.reactions.get_by_id(rid)
+ y = prob.Variable(f"_gf_keep_{rid}", type="binary")
+ indicators[rid] = y
+ # Flux is confined to [lb*y, ub*y]: zero unless the reaction is kept (y=1).
+ extra.append(prob.Constraint(rxn.flux_expression - rxn.upper_bound * y, ub=0, name=f"_gf_ub_{rid}"))
+ extra.append(prob.Constraint(rxn.flux_expression - rxn.lower_bound * y, lb=0, name=f"_gf_lb_{rid}"))
+ working.add_cons_vars(list(indicators.values()) + extra)
+
+ if allow_net_production: # relax steady state to Sv >= 0 (mets may accumulate)
+ for met in working.metabolites:
+ working.constraints[met.id].ub = None
+
+ def pen(rid: str) -> float:
+ return -scores[rid] if scores and rid in scores else penalty
+
+ working.objective = prob.Objective(
+ sum(pen(rid) * indicators[rid] for rid in template_ids), direction="min"
+ )
+ working.slim_optimize()
+ if working.solver.status != "optimal":
+ return None
+ return {rid for rid, y in indicators.items() if (y.primal or 0) > 0.5}
+
+
+def _build_filled(model: cobra.Model, templates: list[cobra.Model], chosen: set[str]) -> cobra.Model:
+ filled = model.copy()
+ remaining = set(chosen)
+ for template in templates:
+ ids = [r for r in remaining if r in template.reactions]
+ if ids:
+ add_reactions_from_model(filled, template, ids, genes=False, note="Added by connect_blocked_reactions")
+ remaining -= set(ids)
+ return filled
+
+
+def connect_blocked_reactions(
+ model: cobra.Model,
+ templates: cobra.Model | Iterable[cobra.Model],
+ *,
+ scores: dict[str, float] | None = None,
+ penalty: float = 1.0,
+ allow_net_production: bool = False,
+ eps: float = 1.0,
+) -> GapFillResult:
+ """Add template reactions so blocked draft reactions can carry flux.
+
+ Finds reactions that
+ cannot carry flux in ``model``, then adds the minimum-penalty set of template
+ reactions that lets the (irreversible) ones carry flux, and returns the filled
+ model. Like RAVEN, only irreversible blocked reactions are forced — reversible
+ ones can carry flux trivially in the split formulation, so forcing them is
+ uninformative.
+
+ For the *other* gap-filling flavour — adding reactions to make the model's
+ objective feasible — use ``cobra.flux_analysis.gapfill`` after aligning the
+ template's metabolite ids to the draft.
+
+ The draft is expected to have exchange reactions for its nutrients (otherwise most
+ reactions are trivially blocked).
+ """
+ templates = _as_models(templates)
+ blocked = set(find_blocked_reactions(model))
+ candidates = [r for r in blocked if model.reactions.get_by_id(r).lower_bound >= 0]
+
+ working, template_ids = _merge_templates(model, templates)
+
+ target: list[str] = []
+ if candidates:
+ fva = flux_variability_analysis(working, reaction_list=candidates, fraction_of_optimum=0.0)
+ # A reaction can be missing from the FVA frame if the solver dropped it
+ # (e.g. the reaction was eliminated upstream); treat that as "unreachable"
+ # rather than letting the KeyError propagate.
+ target = [
+ r for r in candidates
+ if r in fva.index and fva.at[r, "maximum"] > eps
+ ]
+
+ cannot = sorted(blocked - set(target))
+ if not target:
+ return GapFillResult([], [], cannot, model.copy())
+
+ for rid in target:
+ working.reactions.get_by_id(rid).lower_bound = eps
+ chosen = _solve_min_templates(
+ working, template_ids, scores=scores, penalty=penalty,
+ allow_net_production=allow_net_production,
+ )
+ if chosen is None:
+ raise RuntimeError(
+ "Gap-filling is infeasible: the blocked reactions cannot all carry flux "
+ "even with every template reaction added."
+ )
+ return GapFillResult(sorted(chosen), sorted(target), cannot, _build_filled(model, templates, chosen))
diff --git a/src/raven_python/init/__init__.py b/src/raven_python/init/__init__.py
new file mode 100644
index 0000000..040f299
--- /dev/null
+++ b/src/raven_python/init/__init__.py
@@ -0,0 +1,46 @@
+"""Context-specific model extraction (tINIT / ftINIT).
+
+tINIT:
+* :func:`run_init` — the classic INIT MILP.
+* :func:`score_reactions_from_genes` / :func:`gene_scores_from_expression` —
+ gene → reaction scoring (RNA-seq is the common upstream).
+* :func:`get_init_model` — the tINIT pipeline (dead-end removal + ``run_init``).
+
+ftINIT (faster, staged):
+* :func:`run_ftinit` — the single-step ftINIT MILP (continuous indicators for
+ positive-score reactions; binaries only on negatives — the speedup over ``run_init``).
+* :func:`ftinit` — the full pipeline (``prep_init_model`` → staged ``run_ftinit`` →
+ ``fill_tasks`` → ``remove_low_score_genes``).
+"""
+from raven_python.init.build import InitModelResult, get_init_model
+from raven_python.init.ftinit import FtInitResult, ftinit, run_ftinit
+from raven_python.init.genes import remove_low_score_genes
+from raven_python.init.init import InitResult, run_init
+from raven_python.init.merge import group_rxn_scores, merge_linear
+from raven_python.init.prep import PrepData, ReactionMasks, classify_reactions, prep_init_model
+from raven_python.init.score import gene_scores_from_expression, score_reactions_from_genes
+from raven_python.init.steps import InitStep, get_init_steps
+from raven_python.init.taskfill import TaskFillResult, fill_tasks
+
+__all__ = [
+ "FtInitResult",
+ "InitModelResult",
+ "InitResult",
+ "InitStep",
+ "PrepData",
+ "ReactionMasks",
+ "TaskFillResult",
+ "classify_reactions",
+ "fill_tasks",
+ "ftinit",
+ "gene_scores_from_expression",
+ "get_init_model",
+ "get_init_steps",
+ "group_rxn_scores",
+ "merge_linear",
+ "prep_init_model",
+ "remove_low_score_genes",
+ "run_ftinit",
+ "run_init",
+ "score_reactions_from_genes",
+]
diff --git a/src/raven_python/init/build.py b/src/raven_python/init/build.py
new file mode 100644
index 0000000..a0d0538
--- /dev/null
+++ b/src/raven_python/init/build.py
@@ -0,0 +1,113 @@
+"""tINIT model building — high-level pipeline.
+
+Turn expression-derived scores into reaction scores (via the GPR), drop reactions that
+cannot carry flux, then run the INIT MILP to extract a context-specific model. Pass
+gene scores (typically from :func:`gene_scores_from_expression` or one of the omics
+loaders) or reaction scores directly. ``essential_rxns`` are forced kept.
+
+For task-aware gap-filling on top of the resulting model, use ftINIT
+(:func:`raven_python.init.ftinit`); ``get_init_model`` itself does not run the task layer.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+
+import cobra
+from cobra.flux_analysis import find_blocked_reactions
+
+from raven_python.init.init import run_init
+from raven_python.init.score import score_reactions_from_genes
+
+
+@dataclass
+class InitModelResult:
+ """Result of :func:`get_init_model`."""
+
+ model: cobra.Model
+ reaction_scores: dict[str, float]
+ deleted_dead_end_reactions: list[str]
+ deleted_in_init: list[str]
+ met_production: dict[str, bool]
+ objective: float
+
+
+def get_init_model(
+ ref_model: cobra.Model,
+ *,
+ rxn_scores: Mapping[str, float] | None = None,
+ gene_scores: Mapping[str, float] | None = None,
+ isozyme_scoring: str = "max",
+ complex_scoring: str = "min",
+ no_gene_score: float = -2.0,
+ essential_rxns: Iterable[str] | None = None,
+ present_mets: Iterable[str] | None = None,
+ prod_weight: float = 0.5,
+ allow_excretion: bool = True,
+ no_rev_loops: bool = False,
+ remove_dead_ends: bool = True,
+ eps: float = 1.0,
+ big_m: float | None = None,
+ mip_gap: float | None = None,
+ time_limit: float | None = None,
+) -> InitModelResult:
+ """Extract a context-specific model with tINIT.
+
+ Provide either ``rxn_scores`` (reaction id → score) or ``gene_scores`` (gene id →
+ score, converted via the GPR with :func:`score_reactions_from_genes`). Reactions
+ that cannot carry flux (with exchanges open) are removed first unless
+ ``remove_dead_ends=False``; ``essential_rxns`` are kept regardless. The remaining
+ model is passed to :func:`run_init`.
+ """
+ if (rxn_scores is None) == (gene_scores is None):
+ raise ValueError("Provide exactly one of rxn_scores or gene_scores.")
+
+ model = ref_model.copy()
+ essential = set(essential_rxns or [])
+ if gene_scores is not None:
+ scores = score_reactions_from_genes(
+ model, gene_scores, isozyme_scoring=isozyme_scoring,
+ complex_scoring=complex_scoring, no_gene_score=no_gene_score,
+ )
+ else:
+ scores = dict(rxn_scores)
+
+ deleted_dead_end: list[str] = []
+ if remove_dead_ends:
+ # Identify and drop reactions that cannot carry flux even under the
+ # *most permissive* boundary regime: every metabolite open for excretion
+ # (when ``allow_excretion``) plus the exchange-opened FVA. That makes
+ # the pre-filter conservative — only reactions blocked under both lax
+ # and strict regimes are removed, so the strict run_init path never
+ # loses a candidate it could have used.
+ probe = model.copy()
+ original_ids = {r.id for r in model.reactions}
+ if allow_excretion:
+ has_boundary = {m.id for r in probe.boundary for m in r.metabolites}
+ for met in list(probe.metabolites):
+ if met.id not in has_boundary:
+ probe.add_boundary(met, type="demand")
+ blocked = set(find_blocked_reactions(probe, open_exchanges=True))
+ deleted_dead_end = sorted((blocked & original_ids) - essential)
+ model.remove_reactions(deleted_dead_end, remove_orphans=True)
+
+ result = run_init(
+ model, scores,
+ present_mets=present_mets,
+ essential_rxns=essential & {r.id for r in model.reactions},
+ prod_weight=prod_weight,
+ allow_excretion=allow_excretion,
+ no_rev_loops=no_rev_loops,
+ eps=eps,
+ big_m=big_m,
+ mip_gap=mip_gap,
+ time_limit=time_limit,
+ )
+ return InitModelResult(
+ model=result.model,
+ reaction_scores=scores,
+ deleted_dead_end_reactions=deleted_dead_end,
+ deleted_in_init=result.deleted_reactions,
+ met_production=result.met_production,
+ objective=result.objective,
+ )
diff --git a/src/raven_python/init/ftinit.py b/src/raven_python/init/ftinit.py
new file mode 100644
index 0000000..b355e45
--- /dev/null
+++ b/src/raven_python/init/ftinit.py
@@ -0,0 +1,328 @@
+"""The ftINIT MILP — the faster staged variant of INIT.
+
+ftINIT keeps tINIT's objective — pick the reaction subset best matching expression
+scores while staying flux-consistent — but with a cheaper MILP encoding that is the
+reason it is *fast*: a **positive-score reaction needs no binary**. Because the
+objective *maximises* ``Σ score·y`` with ``score > 0``, the optimiser pushes its
+continuous indicator ``y ∈ [0,1]`` to 1, and the gate ``net_flux ≥ force_on·y`` only
+lets ``y`` reach 1 if the reaction can actually carry flux. Only *negative*-score
+reactions need a true ``{0,1}`` binary (their indicator would otherwise sit at 0 for
+free). This roughly halves the integer count — the dominant MILP cost.
+
+Reaction categories (RAVEN's six), by score sign × reversibility:
+
+* **score 0** — left in the model, *not* in the problem: a free flux variable that can
+ carry flux for connectivity but is neither scored nor removable.
+* **positive, irreversible** — continuous ``y∈[0,1]``; ``v ≥ force_on·y``. No binary.
+* **positive, reversible** — split ``v = v⁺ − v⁻``; continuous ``y``; a single
+ direction binary keeps one of ``v⁺/v⁻`` at 0 (no fwd/back loop faking "on");
+ ``v⁺+v⁻ ≥ force_on·y``.
+* **negative, irreversible** — binary ``x∈{0,1}``; ``v ≤ ub·x``.
+* **negative, reversible** — split; binary ``x``; ``v⁺+v⁻ ≤ cap·x``.
+* **essential** — forced on (``v ≥ force_on_ess``); no indicator. Assumed already
+ oriented irreversible in its forced direction (``prepINITModel`` does this).
+
+Objective: **maximise** ``Σ score·indicator``. Unlike classic INIT
+(:func:`raven_python.init.run_init`), ftINIT does **not** reward production of every
+metabolite — ``prod_weight`` applies only to metabolomics-detected metabolites (not
+yet implemented; passing a non-empty ``metabolomics`` argument raises
+``NotImplementedError``). Connectivity comes solely from the flux gates plus any
+essential reactions. ``allow_excretion`` relaxes ``S·v = 0`` to ``≥ 0``; ``rem_pos_rev``
+drops positive reversible reactions from the problem (used in the staging schedule).
+
+Needs a MILP solver (cobra's configured optlang solver; only Gurobi is fully viable at
+genome scale — see ``docs/init_solver_benchmark.md``). Magic numbers
+(``force_on``/``force_on_ess`` = 0.1, ``big_m`` = 100) are exposed and scale-dependent;
+calibration tables are in ``docs/init_param_calibration.md``. ``big_m`` caps a *scored*
+reaction's flux in its on/off (direction) constraint — using a fixed 100 rather than
+the reaction's ±1000 bound keeps the LP relaxation tight (what makes the genome-scale
+MILP tractable). Free / essential reactions keep their real bounds.
+
+⚠️ **Loops.** The MILP has *no* loopless constraint: an internal
+thermodynamically-infeasible cycle is flux-consistent (``S·v = 0``), so if its
+reactions carry positive net score the optimiser will "include" them with no real
+exchange flux. RAVEN tolerates this — loop-free models come from the staged pipeline
++ exchange handling, and at genome scale real exchange reactions make such cycles not
+score-optimal. A loopless option could be layered on later if needed.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass, field
+
+import cobra
+from optlang.symbolics import Real, add, mul
+
+from raven_python.init.genes import remove_low_score_genes
+from raven_python.init.merge import group_rxn_scores
+from raven_python.init.steps import get_init_steps
+from raven_python.init.taskfill import fill_tasks
+
+_FORCE_ON = 0.1 # min flux for a reaction to count as "on" (RAVEN forceOnLim)
+_BIG_M = 100.0 # indicator/direction big-M cap on a *scored* reaction's flux (RAVEN's 100)
+
+
+@dataclass
+class FtInitResult:
+ """Result of :func:`run_ftinit`."""
+
+ model: cobra.Model
+ kept_reactions: list[str]
+ deleted_reactions: list[str]
+ fluxes: dict[str, float]
+ objective: float
+ on_reactions: set[str] = field(default_factory=set) # scored reactions turned on (indicator)
+
+
+def run_ftinit(
+ model: cobra.Model,
+ rxn_scores: Mapping[str, float] | None = None,
+ *,
+ essential_rxns: Iterable[str] | None = None,
+ essential_directions: Mapping[str, int] | None = None,
+ essential_force: Mapping[str, float] | None = None,
+ allow_excretion: bool = False,
+ rem_pos_rev: bool = False,
+ ignore_mets: Iterable[str] = (),
+ force_on: float = _FORCE_ON,
+ force_on_ess: float = _FORCE_ON,
+ big_m: float = _BIG_M,
+ mip_gap: float | None = None,
+ time_limit: float | None = None,
+) -> FtInitResult:
+ """Run the single-step ftINIT MILP and return the extracted model.
+
+ ``rxn_scores`` maps reaction id → score (default 0 → reaction left free in the
+ model, not scored or removable). ``essential_rxns`` are forced to carry flux
+ (≥ ``force_on_ess``); ``essential_directions`` maps an essential reaction id to
+ ``+1`` (forward) or ``-1`` (reverse) for the forced direction (default forward).
+ ``ignore_mets`` are metabolite **names** whose mass balance is dropped (RAVEN's
+ per-step "simple metabolite" removal, e.g. H2O/H+). See the module docstring for
+ the formulation. This is the single-step variant; the staged schedule
+ (:func:`raven_python.init.ftinit`) calls it per step.
+ """
+ scores = dict(rxn_scores or {})
+ essential = set(essential_rxns or [])
+ directions = dict(essential_directions or {})
+ essential_force = dict(essential_force or {})
+ ignore_met_names = set(ignore_mets)
+ prob = model.problem
+ opt = prob.Model()
+
+ variables: list = []
+ constraints: list = []
+ flux_terms: dict[str, list[tuple[object, float]]] = {} # rxn id -> [(var, sign)]
+ indicators: dict[str, tuple[object, float]] = {} # rxn id -> (indicator var, score)
+ free_or_essential: set[str] = set() # kept regardless of an indicator
+
+ def add_constraint(expr, **kw):
+ constraints.append(prob.Constraint(expr, **kw))
+
+ for rxn in model.reactions:
+ rid = rxn.id
+ lb, ub = rxn.lower_bound, rxn.upper_bound
+ score = float(scores.get(rid, 0.0))
+ if rem_pos_rev and score > 0 and lb < 0 < ub:
+ score = 0.0 # staging step 1: positive reversibles dropped from the problem
+
+ if rid in essential:
+ # Forced to carry flux in its forced direction (default forward); respect a
+ # stricter native bound if the model already forces more flux. The forced
+ # magnitude may be set per reaction (RAVEN's min(0.99·|prev flux|, 0.1), so
+ # a reaction is never forced above what it carried before).
+ force = essential_force.get(rid, force_on_ess) if essential_force else force_on_ess
+ if directions.get(rid, 1) >= 0:
+ forced = min(force, ub) # clamp to capacity so we never make lb > ub
+ v = prob.Variable(f"v_{rid}", lb=max(forced, lb, 0.0), ub=ub)
+ else: # reverse: flux ≤ -force
+ forced = min(force, -lb)
+ v = prob.Variable(f"v_{rid}", lb=lb, ub=min(-forced, ub))
+ variables.append(v)
+ flux_terms[rid] = [(v, 1.0)]
+ free_or_essential.add(rid)
+ continue
+
+ if score == 0.0: # free: carries flux for connectivity, not scored/removable
+ v = prob.Variable(f"v_{rid}", lb=lb, ub=ub)
+ variables.append(v)
+ flux_terms[rid] = [(v, 1.0)]
+ free_or_essential.add(rid)
+ continue
+
+ reversible = lb < 0 < ub
+ if reversible:
+ vp = prob.Variable(f"vp_{rid}", lb=0.0, ub=ub)
+ vn = prob.Variable(f"vn_{rid}", lb=0.0, ub=-lb)
+ variables += [vp, vn]
+ flux_terms[rid] = [(vp, 1.0), (vn, -1.0)]
+ total = vp + vn # |flux| (one of vp/vn pinned to 0 below), used by the gates
+ else: # single-direction: keep the model's own [lb, ub] (incl. any forced lb>0)
+ v = prob.Variable(f"v_{rid}", lb=lb, ub=ub)
+ variables.append(v)
+ flux_terms[rid] = [(v, 1.0)]
+ total = v if ub > 0 else -v # magnitude for a single-direction reaction
+
+ if score > 0:
+ y = prob.Variable(f"y_{rid}", lb=0.0, ub=1.0) # continuous indicator, no binary
+ variables.append(y)
+ indicators[rid] = (y, score)
+ add_constraint(total - force_on * y, lb=0.0, name=f"on_{rid}") # y=1 ⇒ |flux| ≥ force_on
+ if reversible: # one direction binary stops a fwd/back loop faking "on"
+ b = prob.Variable(f"b_{rid}", type="binary")
+ variables.append(b)
+ add_constraint(vp - big_m * b, ub=0.0, name=f"dirp_{rid}") # vp ≤ M·b
+ add_constraint(vn + big_m * b, ub=big_m, name=f"dirn_{rid}") # vn ≤ M·(1-b)
+ else: # score < 0
+ x = prob.Variable(f"x_{rid}", type="binary")
+ variables.append(x)
+ indicators[rid] = (x, score)
+ add_constraint(total - big_m * x, ub=0.0, name=f"off_{rid}") # flux>0 ⇒ x=1
+
+ # Steady state S·v {== 0 | >= 0}; ignored metabolites are left unbalanced.
+ # Build each metabolite's balance as a *flat* list of (coeff·sign)·var terms and sum
+ # it with optlang.symbolics.add. Python's builtin sum re-canonicalises a growing
+ # sympy expression at every step (O(n²)); for hub metabolites that appear in ~10³
+ # reactions that is minutes per constraint. add() builds the sum in one pass.
+ met_terms: dict = {m: [] for m in model.metabolites if m.name not in ignore_met_names}
+ for rxn in model.reactions:
+ terms = flux_terms[rxn.id]
+ for met, coeff in rxn.metabolites.items():
+ bucket = met_terms.get(met)
+ if bucket is None:
+ continue
+ for var, sign in terms:
+ bucket.append(mul([Real(coeff * sign), var]))
+ for termlist in met_terms.values():
+ if termlist:
+ add_constraint(add(termlist), lb=0.0, ub=None if allow_excretion else 0.0)
+
+ opt.add(variables + constraints)
+ opt.objective = prob.Objective(
+ add([mul([Real(score), ind]) for ind, score in indicators.values()]), direction="max"
+ )
+ if time_limit is not None:
+ opt.configuration.timeout = int(time_limit)
+ if mip_gap is not None:
+ try: # Gurobi-specific; harmless if the backend differs
+ opt.problem.Params.MIPGap = mip_gap
+ except Exception: # noqa: BLE001
+ pass
+ opt.optimize()
+ # Accept a near-optimal incumbent (when a MIP gap / time limit is set), as RAVEN does.
+ if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"):
+ raise RuntimeError(f"ftINIT MILP did not solve (status: {opt.status}).")
+
+ # RAVEN: a reaction is "on" iff its indicator ≥ 0.5 (positive indicators are
+ # continuous and can land fractionally when a reaction can carry only tiny flux).
+ on = {rid for rid, (ind, _) in indicators.items() if (ind.primal or 0.0) >= 0.5}
+ kept = free_or_essential | on
+ deleted = [r.id for r in model.reactions if r.id not in kept]
+ fluxes = {
+ rid: sum(sign * (var.primal or 0.0) for var, sign in terms)
+ for rid, terms in flux_terms.items()
+ }
+
+ out = model.copy()
+ out.remove_reactions(deleted, remove_orphans=True)
+ return FtInitResult(out, sorted(kept), sorted(deleted), fluxes,
+ float(opt.objective.value), on_reactions=on)
+
+
+def ftinit(
+ prep,
+ rxn_scores: Mapping[str, float],
+ *,
+ gene_scores: Mapping[str, float] | None = None,
+ series: str = "1+1",
+ steps=None,
+ fill_gaps: bool = True,
+ metabolomics: Iterable[str] | None = None,
+ force_on: float = _FORCE_ON,
+ big_m: float = _BIG_M,
+ mip_gap: float | None = None,
+ time_limit: float | None = None,
+) -> cobra.Model:
+ """Run the full ftINIT pipeline on prepData and return the context-specific model.
+
+ ``prep`` is a :class:`raven_python.init.PrepData`. ``rxn_scores`` maps **original**
+ reaction id → score (e.g. from :func:`score_reactions_from_genes` on the template).
+ Each step (:func:`raven_python.init.get_init_steps`) regroups scores under its
+ ``ignore_mask``, fixes the reactions turned on by earlier steps as essential (in
+ their flux direction), and solves :func:`run_ftinit` on the merged model. Reactions
+ never turned on (and not essential or left-in) are removed from the reference model;
+ exchange reactions are always kept (RAVEN re-adds them).
+
+ If ``fill_gaps`` and ``prep`` carries tasks, reactions are added back so every task
+ is feasible (:func:`raven_python.init.fill_tasks`). If ``gene_scores`` is given,
+ negative-scoring genes are pruned from the GPRs at the end
+ (:func:`raven_python.init.remove_low_score_genes`).
+
+ Essential reactions are forced to carry ``force_on`` (default 0.1) of flux in the
+ forced direction. On genome-scale models a stricter regime is needed (the previous
+ step's actual carried flux instead of a flat 0.1) — exposed via per-reaction
+ ``essential_force`` on :func:`run_ftinit`.
+
+ ``metabolomics`` (a list of detected metabolite names to reward producing) is
+ **not yet implemented**: the linear merge eliminates degree-2 detected metabolites,
+ so it needs a producer-group-mapping + negative-producer force-flux block — the
+ most intricate MILP piece, for the least-used input. Passing a non-empty value
+ raises ``NotImplementedError``.
+
+ ``mip_gap``/``time_limit`` are forwarded to each :func:`run_ftinit` solve. On
+ genome-scale models they are essential for tractability — see
+ ``docs/init_param_calibration.md`` for the calibration table.
+ """
+ if metabolomics:
+ raise NotImplementedError(
+ "metabolomics production-bonus is not yet implemented."
+ )
+ steps = steps if steps is not None else get_init_steps(series)
+ min_model, group_of = prep.min_model, prep.group_of
+
+ turned_on: dict[str, float] = {} # merged reaction id -> flux (accumulated)
+ left_in: set[str] = set() # merged reactions with score 0 in the last step
+ for step in steps:
+ to_zero = prep.masks.ignored(step.ignore_mask)
+ scores = group_rxn_scores(min_model, rxn_scores, prep.orig_rxn_ids,
+ prep.group_ids, to_zero)
+ essential = set(prep.essential_rxns) # pre-oriented forward (default direction)
+ directions: dict[str, int] = {}
+ ess_force: dict[str, float] = {}
+ if step.how_to_use_prev == "essential":
+ for rid, flux in turned_on.items():
+ essential.add(rid)
+ directions[rid] = 1 if flux >= 0 else -1
+ # never force more flux than the reaction carried before (RAVEN)
+ ess_force[rid] = min(abs(flux) * 0.99, force_on)
+ res = run_ftinit(
+ min_model, scores, essential_rxns=essential, essential_directions=directions,
+ essential_force=ess_force, allow_excretion=step.allow_met_secr,
+ rem_pos_rev=step.pos_rev_off, ignore_mets=step.mets_to_ignore,
+ force_on=force_on, force_on_ess=force_on, big_m=big_m,
+ mip_gap=mip_gap, time_limit=time_limit,
+ )
+ for rid in res.on_reactions:
+ turned_on[rid] = res.fluxes[rid]
+ left_in = {rid for rid, s in scores.items() if s == 0.0}
+
+ # Merged reactions to keep: turned on + permanently essential + left-in (score 0).
+ kept_min = set(turned_on) | set(prep.essential_rxns) | left_in
+ deleted_min = [r.id for r in min_model.reactions if r.id not in kept_min]
+
+ # Map deleted merged reactions back to all originals in their groups.
+ removed_groups = {group_of[rid] for rid in deleted_min if group_of[rid] != 0}
+ to_remove = {o for o in prep.orig_rxn_ids if group_of[o] and group_of[o] in removed_groups}
+ to_remove |= {rid for rid in deleted_min if group_of[rid] == 0} # unmerged
+ # Keep the surviving originals plus all exchange reactions (always re-added).
+ final_kept = (set(prep.orig_rxn_ids) - to_remove) | prep.masks.exchange
+
+ out = prep.ref_model.copy()
+ out.remove_reactions([r.id for r in out.reactions if r.id not in final_kept],
+ remove_orphans=True)
+
+ if fill_gaps and prep.tasks: # add reactions back so every task is feasible
+ out = fill_tasks(out, prep.ref_model, prep.tasks, rxn_scores=rxn_scores,
+ mip_gap=mip_gap, time_limit=time_limit).model
+ if gene_scores is not None: # prune negative-scoring genes from the GPRs
+ out, _ = remove_low_score_genes(out, gene_scores)
+ return out
diff --git a/src/raven_python/init/genes.py b/src/raven_python/init/genes.py
new file mode 100644
index 0000000..ceed3da
--- /dev/null
+++ b/src/raven_python/init/genes.py
@@ -0,0 +1,85 @@
+"""Prune low-scoring genes from a model — the last ftINIT step.
+
+Drop negative-scoring genes from each reaction's GPR, while
+respecting enzyme structure — genes joined by **OR** (isozymes) are candidates for
+removal, but at least one must remain (the least-negative if all are negative);
+genes joined by **AND** (complex subunits) are *not* removed individually, though a
+whole complex can be dropped as one isozyme alternative if its (aggregated) score is
+negative. Operates on cobra's GPR AST recursively, so nested rules like
+``G1 and (G2 or G3) and G4`` prune the inner isozyme group correctly.
+"""
+from __future__ import annotations
+
+import ast
+import statistics
+from collections.abc import Mapping
+
+import cobra
+from cobra.manipulation import remove_genes
+
+_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean}
+
+
+def _prune(node, scores, iso, cplx) -> tuple[str | None, float | None]:
+ """Return (pruned GPR string, aggregate score) for an AST node, or (None, None)."""
+ if isinstance(node, ast.Name):
+ return node.id, scores.get(node.id) # None = unscored (NaN: never removed)
+ if not isinstance(node, ast.BoolOp):
+ return None, None
+
+ children = [_prune(v, scores, iso, cplx) for v in node.values]
+ children = [(s, sc) for s, sc in children if s is not None]
+
+ if isinstance(node.op, ast.And): # complex: keep every subunit, prune nested ORs
+ kept = children
+ else: # OR / isozymes: drop negative-scoring alternatives, keep at least one
+ kept = [(s, sc) for s, sc in children if sc is None or sc >= 0]
+ if not kept: # all negative → keep the least-negative
+ kept = [max(children, key=lambda c: c[1])]
+
+ parts = [s for s, _ in kept]
+ score_vals = [sc for _, sc in kept if sc is not None]
+ agg = (cplx if isinstance(node.op, ast.And) else iso)
+ score = agg(score_vals) if score_vals else None
+ op = " and " if isinstance(node.op, ast.And) else " or "
+ text = parts[0] if len(parts) == 1 else "(" + op.join(parts) + ")"
+ return text, score
+
+
+def remove_low_score_genes(
+ model: cobra.Model,
+ gene_scores: Mapping[str, float],
+ *,
+ isozyme_scoring: str = "max",
+ complex_scoring: str = "min",
+) -> tuple[cobra.Model, list[str]]:
+ """Remove negative-scoring genes from GPRs (RAVEN ``removeLowScoreGenes``).
+
+ ``gene_scores`` maps gene id → score; genes absent from it are treated as unscored
+ (never removed). Returns ``(new_model, removed_gene_ids)`` — genes dropped from
+ *every* rule they were in (and thus from the model). ``isozyme_scoring`` /
+ ``complex_scoring`` aggregate alternative/subunit scores (``max``/``min`` default).
+
+ When all isozyme alternatives are negative the least-negative one is kept
+ **deterministically** (first on a tie), unlike RAVEN's random tie-break — same
+ quality, reproducible.
+ """
+ for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)):
+ if value not in _AGG:
+ raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.")
+ iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring]
+
+ out = model.copy()
+ for rxn in out.reactions:
+ body = rxn.gpr.body
+ if body is None or not rxn.genes:
+ continue
+ pruned, _ = _prune(body, gene_scores, iso, cplx)
+ if pruned is not None:
+ rxn.gene_reaction_rule = pruned
+
+ used = {g.id for rxn in out.reactions for g in rxn.genes}
+ removed = sorted(g.id for g in out.genes if g.id not in used)
+ if removed:
+ remove_genes(out, removed, remove_reactions=False)
+ return out, removed
diff --git a/src/raven_python/init/init.py b/src/raven_python/init/init.py
new file mode 100644
index 0000000..f23e17a
--- /dev/null
+++ b/src/raven_python/init/init.py
@@ -0,0 +1,254 @@
+"""The INIT MILP — tINIT core.
+
+INIT (Agren et al., PLoS Comput Biol 2012) extracts a context-specific model: keep a
+flux-consistent subnetwork that maximises the summed score of *included* reactions
+(positive score = evidence to keep, negative = evidence to remove), optionally
+rewarding net production of metabolites.
+
+Formulation:
+
+* Reversible reactions are split into forward / reverse directed reactions (flux ≥ 0).
+* Each non-essential directed reaction gets a binary ``x`` (included ⇔ ``x=1``) with
+ ``eps·x ≤ v ≤ ub·x`` — included reactions must carry flux ≥ ``eps`` (connectivity),
+ excluded ones carry none.
+* Essential reactions (``essential_rxns``) are forced to carry flux (``v ≥ eps``) and
+ skip the binary.
+* ``no_rev_loops`` adds ``x_fwd + x_rev ≤ 1`` so a reversible reaction can't look
+ "connected" via an internal forward/back loop.
+* Steady state ``S·v = 0`` per metabolite; ``allow_excretion`` relaxes it to ``≥ 0``
+ (net production allowed). With ``prod_weight > 0`` a per-metabolite sink
+ ``s_m ∈ [0,1]`` is added and rewarded, giving a reason to include connectivity
+ reactions.
+* Objective: **maximise** ``Σ score·x + prod_weight·Σ s_m``.
+
+Needs a MILP solver (cobra's configured optlang solver). On genome-scale problems,
+Gurobi is the only backend that is fully usable today (see
+``docs/init_solver_benchmark.md``).
+
+**Parameter caveat — magic numbers are scale-dependent.** ``eps`` (the flux an
+included reaction must carry, default 1.0) and ``prod_weight`` (default 0.5) only make
+sense when reaction bounds are ~±1000 and scores are O(1); the right values depend on
+the model's flux magnitudes and the score distribution. The upper gate uses each
+reaction's own ``ub`` as the big-M by default (adapts to the model); pass ``big_m`` to
+override with a fixed cap for a tighter LP relaxation. Calibration tables live in
+``docs/init_param_calibration.md``.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+
+import cobra
+from optlang.symbolics import Real, add, mul
+
+_EPS = 1.0 # flux an included reaction must carry (RAVEN's fake-met unit)
+
+
+@dataclass
+class _Directed:
+ """One directed reaction in the split (irreversible) problem."""
+
+ key: str
+ origin: str # original reaction id
+ coeffs: dict[str, float] # met id -> stoichiometry (already sign-adjusted)
+ ub: float
+ score: float
+ essential: bool
+
+
+@dataclass
+class InitResult:
+ """Result of :func:`run_init`."""
+
+ model: cobra.Model
+ deleted_reactions: list[str]
+ met_production: dict[str, bool] # present-met name -> producible?
+ objective: float
+
+
+def _split_reactions(
+ model: cobra.Model, scores: Mapping[str, float], essential: set[str]
+) -> list[_Directed]:
+ directed: list[_Directed] = []
+ for rxn in model.reactions:
+ score = float(scores.get(rxn.id, 0.0))
+ coeffs = {m.id: c for m, c in rxn.metabolites.items()}
+ rev_coeffs = {m: -c for m, c in coeffs.items()}
+ if rxn.id in essential:
+ # Force flux in a *single* direction (forward if it can run forward, else
+ # reverse) — like an irreversible essential reaction. Emitting both halves
+ # as essential would force fwd ≥ eps AND rev ≥ eps, i.e. a phantom
+ # eps-magnitude self-loop that can starve out the real pathway.
+ if rxn.upper_bound > 0:
+ directed.append(_Directed(rxn.id, rxn.id, coeffs, rxn.upper_bound, score, True))
+ else:
+ directed.append(_Directed(f"{rxn.id}__rev", rxn.id, rev_coeffs,
+ -rxn.lower_bound, score, True))
+ continue
+ if rxn.upper_bound > 0:
+ directed.append(_Directed(rxn.id, rxn.id, coeffs, rxn.upper_bound, score, False))
+ if rxn.lower_bound < 0: # reverse direction as its own non-negative flux
+ directed.append(
+ _Directed(f"{rxn.id}__rev", rxn.id, rev_coeffs, -rxn.lower_bound, score, False)
+ )
+ return directed
+
+
+def run_init(
+ model: cobra.Model,
+ rxn_scores: Mapping[str, float] | None = None,
+ *,
+ present_mets: Iterable[str] | None = None,
+ essential_rxns: Iterable[str] | None = None,
+ prod_weight: float = 0.5,
+ allow_excretion: bool = False,
+ no_rev_loops: bool = False,
+ eps: float = _EPS,
+ big_m: float | None = None,
+ mip_gap: float | None = None,
+ time_limit: float | None = None,
+) -> InitResult:
+ """Run the INIT MILP and return the extracted model.
+
+ ``rxn_scores`` maps reaction id → score (default 0). ``essential_rxns`` must be
+ kept (forced to carry flux). ``present_mets`` are metabolite *names* that the
+ network should be able to produce; each is tested and reported in
+ ``met_production``. See the module docstring for the formulation.
+
+ Note on score 0 (classic INIT vs. ftINIT divergence): in classic INIT a
+ reaction with score exactly 0 receives an include-indicator with **zero
+ reward**, so the optimiser is free to drop it. This matches RAVEN's
+ `runINIT` semantics. ftINIT inverts that — score-0 reactions stay in the
+ model unless they actively hurt feasibility — so a score of exactly 0
+ means *different things* in the two variants. If you want score-0
+ reactions kept here, pass a small positive value (e.g. ``min_score`` from
+ `gene_scores_from_expression`) instead of 0.
+ """
+ scores = dict(rxn_scores or {})
+ essential = set(essential_rxns or [])
+ present = list(present_mets or [])
+
+ directed = _split_reactions(model, scores, essential)
+ prob = model.problem
+ opt = prob.Model()
+
+ # Flux variables for every directed reaction.
+ flux = {d.key: prob.Variable(f"v_{d.key}", lb=0.0, ub=d.ub) for d in directed}
+
+ # Binary include-indicators for non-essential reactions; eps*x <= v <= ub*x.
+ keep: dict[str, object] = {}
+ gates = []
+ for d in directed:
+ if d.essential:
+ flux[d.key].lb = max(eps, 0.0) # forced to carry flux
+ continue
+ x = prob.Variable(f"x_{d.key}", type="binary")
+ keep[d.key] = x
+ cap = d.ub if big_m is None else big_m # big-M: per-reaction bound (default) or fixed
+ gates.append(prob.Constraint(flux[d.key] - cap * x, ub=0.0, name=f"ub_{d.key}"))
+ gates.append(prob.Constraint(flux[d.key] - eps * x, lb=0.0, name=f"lb_{d.key}"))
+
+ # no_rev_loops: at most one direction of a reversible reaction is included.
+ by_origin: dict[str, list[str]] = {}
+ for d in directed:
+ by_origin.setdefault(d.origin, []).append(d.key)
+ if no_rev_loops:
+ for keys in by_origin.values():
+ xs = [keep[k] for k in keys if k in keep]
+ if len(xs) > 1:
+ gates.append(prob.Constraint(sum(xs), ub=1.0, name=f"onedir_{keys[0]}"))
+
+ # Steady-state constraints S·v (- sink) {==0 | >=0}, plus prod_weight sinks.
+ # Accumulate each metabolite's terms by iterating reactions once (avoids the
+ # O(mets·rxns) per-metabolite filter) and sum with optlang.symbolics.add — Python
+ # sum() re-canonicalises a growing sympy expression each step (O(n²)), which is
+ # minutes per hub metabolite at genome scale.
+ met_terms: dict[str, list] = {met.id: [] for met in model.metabolites}
+ for d in directed:
+ v = flux[d.key]
+ for mid, coeff in d.coeffs.items():
+ met_terms[mid].append(mul([Real(coeff), v]))
+
+ sinks: dict[str, object] = {}
+ met_constraints: dict[str, object] = {}
+ ub = None if allow_excretion else 0.0
+ for met in model.metabolites:
+ terms = met_terms[met.id]
+ if prod_weight != 0:
+ s = prob.Variable(f"s_{met.id}", lb=0.0, ub=1.0)
+ sinks[met.id] = s
+ terms = [*terms, mul([Real(-1.0), s])] # net production drained into rewarded sink
+ if terms:
+ met_constraints[met.id] = prob.Constraint(add(terms), lb=0.0, ub=ub)
+
+ opt.add(list(flux.values()) + list(keep.values()) + list(sinks.values())
+ + gates + list(met_constraints.values()))
+
+ objective = prob.Objective(
+ add([mul([Real(d.score), keep[d.key]]) for d in directed if d.key in keep]
+ + [mul([Real(prod_weight), s]) for s in sinks.values()]),
+ direction="max",
+ )
+ opt.objective = objective
+
+ met_production = _check_present_mets(prob, present, model, directed, allow_excretion)
+
+ if time_limit is not None:
+ opt.configuration.timeout = int(time_limit)
+ if mip_gap is not None:
+ try: # Gurobi-specific; harmless if the backend differs
+ opt.problem.Params.MIPGap = mip_gap
+ except Exception: # noqa: BLE001
+ pass
+ opt.optimize()
+ # With a MIP gap / time limit set, accept a near-optimal incumbent (as RAVEN does).
+ if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"):
+ raise RuntimeError(f"INIT MILP did not solve (status: {opt.status}).")
+
+ # A reaction is kept if any of its directed parts is essential or has x≈1.
+ kept_origins = {d.origin for d in directed if d.essential}
+ kept_origins |= {d.origin for d in directed if d.key in keep and (keep[d.key].primal or 0) > 0.5}
+ deleted = [r.id for r in model.reactions if r.id not in kept_origins]
+
+ out = model.copy()
+ out.remove_reactions(deleted, remove_orphans=True)
+ return InitResult(out, sorted(deleted), met_production, float(opt.objective.value))
+
+
+def _check_present_mets(prob, present, model, directed, allow_excretion) -> dict[str, bool]:
+ """Whether each present metabolite (by name) can be net-produced at all.
+
+ A small LP per metabolite (no score/binary, so it's the LP relaxation, as RAVEN
+ does): all reactions available, steady state, and a demand draining ≥1 unit of
+ any compartment form of the metabolite — feasible ⇔ producible.
+ """
+ if not present:
+ return {}
+ name_to_ids: dict[str, list[str]] = {}
+ for met in model.metabolites:
+ name_to_ids.setdefault((met.name or met.id).upper(), []).append(met.id)
+
+ result: dict[str, bool] = {}
+ for name in present:
+ ids = name_to_ids.get(name.upper())
+ if not ids:
+ result[name] = False
+ continue
+ lp = prob.Model()
+ flux = {d.key: prob.Variable(f"v_{d.key}", lb=0.0, ub=d.ub) for d in directed}
+ drains = {mid: prob.Variable(f"drain_{mid}", lb=0.0, ub=1e6) for mid in ids}
+ terms: dict[str, list] = {met.id: [] for met in model.metabolites}
+ for d in directed:
+ v = flux[d.key]
+ for mid, c in d.coeffs.items():
+ terms[mid].append(mul([Real(c), v]))
+ for mid in drains:
+ terms[mid].append(mul([Real(-1.0), drains[mid]]))
+ cons = [prob.Constraint(add(t), lb=0.0, ub=None if allow_excretion else 0.0)
+ for t in terms.values() if t]
+ require = prob.Constraint(add(list(drains.values())), lb=1.0, name="_require_production")
+ lp.add(list(flux.values()) + list(drains.values()) + cons + [require])
+ lp.objective = prob.Objective(prob.Variable("_zero", lb=0, ub=0), direction="max")
+ lp.optimize()
+ result[name] = lp.status == "optimal"
+ return result
diff --git a/src/raven_python/init/merge.py b/src/raven_python/init/merge.py
new file mode 100644
index 0000000..a26f41c
--- /dev/null
+++ b/src/raven_python/init/merge.py
@@ -0,0 +1,226 @@
+"""Linear reaction merging for ftINIT.
+
+ftINIT shrinks the MILP losslessly by **contracting linear reaction chains**: a
+metabolite that appears in exactly two reactions (one net producer, one net consumer)
+links them into a single combined reaction. Iterating this collapses unbranched
+pathways — on Human-GEM ~12k → ~8k reactions, a ~⅓ smaller MILP — without changing
+the feasible flux space. Reversible reactions may merge too (unlike
+``simplifyModel``'s merge), which is why ftINIT ships its own.
+
+:func:`merge_linear` returns the reduced model plus the bookkeeping needed to map
+scores and results back to the original reactions:
+
+* ``group_ids`` — one integer per original reaction; ``0`` = not merged, equal
+ non-zero integers = merged into the same combined reaction (which keeps one
+ member's id).
+* ``reversed_rxns`` — which originals were flipped (their stored direction negated)
+ when oriented for merging; needed to map fluxes/directions back.
+
+:func:`group_rxn_scores` then sums the original per-reaction scores over each group,
+with RAVEN's zero-handling (see its docstring): genuine 0 → 0.01, ignore-masked → 0,
+a group cancelling to 0 with non-zero members → 0.01 — all so the MILP never sees an
+exactly-zero score (whose on/off would be arbitrary).
+"""
+from __future__ import annotations
+
+import math
+from collections import defaultdict
+from collections.abc import Iterable, Mapping
+
+import cobra
+
+_TOL = 1e-12
+
+
+class _Rxn:
+ """Mutable working reaction during the merge."""
+
+ __slots__ = ("id", "name", "coeffs", "lb", "ub")
+
+ def __init__(self, rid, name, coeffs, lb, ub):
+ self.id, self.name, self.coeffs, self.lb, self.ub = rid, name, coeffs, lb, ub
+
+ @property
+ def reversible(self) -> bool: # RAVEN's rev flag ≡ a negative lower bound
+ return self.lb < 0
+
+
+def merge_linear(
+ model: cobra.Model, no_merge: Iterable[str] = ()
+) -> tuple[cobra.Model, list[str], list[int], list[bool]]:
+ """Merge linearly-dependent reactions; return ``(reduced, orig_ids, group_ids, reversed)``.
+
+ ``no_merge`` reaction ids are never merged. The reduced model carries no genes
+ (merging makes GPRs meaningless); scores are remapped with
+ :func:`group_rxn_scores`.
+
+ Each pass recomputes the metabolite→reaction incidence fresh, then merges over the
+ degree-2 metabolites found at the start of the pass. A metabolite that only
+ *becomes* degree-2 mid-pass (because one of its reactions was just merged into a
+ survivor) is therefore picked up on the next pass rather than immediately — linear
+ merging is confluent, so the final grouping is the same regardless, it just takes a
+ few extra passes on long chains. (RAVEN re-finds incidence per metabolite and so
+ finishes a chain in one pass; the end result is equivalent.)
+ """
+ banned = set(no_merge)
+ orig_ids = [r.id for r in model.reactions]
+ group_of: dict[str, int] = {rid: 0 for rid in orig_ids}
+ reversed_of: dict[str, bool] = {rid: False for rid in orig_ids}
+ next_group = 1
+
+ rxns = [
+ _Rxn(r.id, r.name, {m.id: c for m, c in r.metabolites.items()},
+ r.lower_bound, r.upper_bound)
+ for r in model.reactions
+ ]
+
+ def flip(rx: _Rxn) -> None:
+ rx.coeffs = {m: -c for m, c in rx.coeffs.items()}
+ rx.lb, rx.ub = -rx.ub, -rx.lb
+ grp = group_of[rx.id]
+ targets = [o for o in orig_ids if group_of[o] == grp] if grp else [rx.id]
+ for o in targets:
+ reversed_of[o] = not reversed_of[o]
+
+ def relabel(rx: _Rxn, grp: int) -> None:
+ old = group_of[rx.id]
+ if old == grp:
+ return
+ if old == 0:
+ group_of[rx.id] = grp
+ else:
+ for o in orig_ids:
+ if group_of[o] == old:
+ group_of[o] = grp
+
+ while True:
+ incidence: dict[str, list[int]] = defaultdict(list)
+ for i, rx in enumerate(rxns):
+ for m in rx.coeffs:
+ incidence[m].append(i)
+ degree2 = [m for m, ii in incidence.items() if len(ii) == 2]
+
+ merged_some = False
+ for met in degree2:
+ involved = [i for i in incidence[met] if met in rxns[i].coeffs]
+ if len(involved) != 2:
+ continue # one side already merged away this pass
+ a, b = involved
+ if rxns[a].id in banned or rxns[b].id in banned:
+ continue
+ ca, cb = rxns[a].coeffs[met], rxns[b].coeffs[met]
+ ra, rb = rxns[a].reversible, rxns[b].reversible
+ pos = (ca > 0 or ra) + (cb > 0 or rb)
+ neg = (ca < 0 or ra) + (cb < 0 or rb)
+ if pos < 1 or neg < 1:
+ continue # need one producer and one consumer
+
+ r1, r2 = a, b
+ # Special case: rev producer first, irrev producer second → swap (RAVEN l.74).
+ if rxns[r1].reversible and not rxns[r2].reversible \
+ and rxns[r1].coeffs[met] > 0 and rxns[r2].coeffs[met] > 0:
+ r1, r2 = r2, r1
+ # Make r1 the producer of `met`.
+ if rxns[r1].coeffs[met] < 0:
+ if rxns[r2].coeffs[met] > 0:
+ r1, r2 = r2, r1
+ elif rxns[r1].reversible:
+ flip(rxns[r1])
+ elif rxns[r2].reversible:
+ flip(rxns[r2])
+ r1, r2 = r2, r1
+ else:
+ raise RuntimeError("mergeLinear: no producer orientation possible.")
+ # Make r2 the consumer.
+ if rxns[r2].coeffs[met] > 0:
+ if rxns[r2].reversible:
+ flip(rxns[r2])
+ else:
+ raise RuntimeError("mergeLinear: no consumer orientation possible.")
+
+ ratio = abs(rxns[r1].coeffs[met] / rxns[r2].coeffs[met])
+ merged = defaultdict(float, rxns[r1].coeffs)
+ for m, c in rxns[r2].coeffs.items():
+ merged[m] += c * ratio
+ merged[met] = 0.0
+ rxns[r1].coeffs = {m: c for m, c in merged.items() if abs(c) > _TOL}
+
+ # Most-constraining bounds win (RAVEN scales r2's bounds by the ratio).
+ if not math.isinf(rxns[r2].lb):
+ rxns[r1].lb = max(rxns[r1].lb, rxns[r2].lb / ratio)
+ if not math.isinf(rxns[r2].ub):
+ rxns[r1].ub = min(rxns[r1].ub, rxns[r2].ub / ratio)
+ rxns[r2].coeffs = {} # cleared → removed after the pass
+
+ grp = max(group_of[rxns[r1].id], group_of[rxns[r2].id]) or next_group
+ if grp == next_group:
+ next_group += 1
+ relabel(rxns[r1], grp)
+ relabel(rxns[r2], grp)
+ merged_some = True
+
+ if not merged_some:
+ break
+ rxns = [rx for rx in rxns if rx.coeffs]
+
+ return _build_model(model, rxns), orig_ids, [group_of[o] for o in orig_ids], \
+ [reversed_of[o] for o in orig_ids]
+
+
+def _build_model(template: cobra.Model, rxns: list[_Rxn]) -> cobra.Model:
+ """Assemble the reduced cobra model (gene-free) from the merged working reactions."""
+ reduced = cobra.Model(template.id)
+ used = {m for rx in rxns for m in rx.coeffs}
+ reduced.add_metabolites([
+ cobra.Metabolite(m.id, name=m.name, compartment=m.compartment, formula=m.formula)
+ for m in template.metabolites if m.id in used # template order preserved
+ ])
+ new_rxns = []
+ for rx in rxns:
+ r = cobra.Reaction(rx.id, name=rx.name, lower_bound=rx.lb, upper_bound=rx.ub)
+ new_rxns.append(r)
+ reduced.add_reactions(new_rxns)
+ for rx, r in zip(rxns, new_rxns, strict=True):
+ r.add_metabolites({reduced.metabolites.get_by_id(m): c for m, c in rx.coeffs.items()})
+ return reduced
+
+
+def group_rxn_scores(
+ reduced_model: cobra.Model,
+ orig_scores: Mapping[str, float],
+ orig_rxn_ids: list[str],
+ group_ids: list[int],
+ to_zero: Iterable[str] = (),
+) -> dict[str, float]:
+ """Sum original reaction scores over merged groups (RAVEN ``groupRxnScores``).
+
+ ``orig_scores`` maps original reaction id → score; ``to_zero`` are reactions to
+ drop from the problem (the ``toIgnore`` masks) — their score becomes 0. Genuine
+ zeros and groups cancelling to zero become 0.01 so the MILP never sees an exactly
+ zero score. Returns ``{reduced_reaction_id: score}``.
+ """
+ zero = set(to_zero)
+ group_of = dict(zip(orig_rxn_ids, group_ids, strict=True))
+ # Per-original adjusted score: genuine 0 → 0.01, then ignore-masked → 0.
+ adj: dict[str, float] = {}
+ for rid in orig_rxn_ids:
+ s = float(orig_scores.get(rid, 0.0))
+ s = 0.01 if s == 0.0 else s
+ adj[rid] = 0.0 if rid in zero else s
+ members: dict[int, list[str]] = defaultdict(list)
+ for rid in orig_rxn_ids:
+ if group_of[rid] != 0: # only merged groups need member lists
+ members[group_of[rid]].append(rid)
+
+ scores: dict[str, float] = {}
+ for r in reduced_model.reactions:
+ grp = group_of[r.id]
+ if grp == 0: # unmerged: keep the reaction's own (adjusted) score
+ scores[r.id] = adj[r.id]
+ else:
+ group = members[grp]
+ total = sum(adj[m] for m in group)
+ if total == 0.0 and any(adj[m] != 0.0 for m in group):
+ total = 0.01 # cancelled to zero but had non-zero members
+ scores[r.id] = total
+ return scores
diff --git a/src/raven_python/init/prep.py b/src/raven_python/init/prep.py
new file mode 100644
index 0000000..8ed4b89
--- /dev/null
+++ b/src/raven_python/init/prep.py
@@ -0,0 +1,241 @@
+"""ftINIT preprocessing — once-per-template work shared by every sample on a model.
+
+ftINIT does all omics-independent work once: classify reactions into the categories
+the staged MILP may *ignore* (leave in, never remove), discover task-essential
+reactions, linearly merge, and scale. The result (:class:`PrepData`) is reused across
+every sample.
+
+:func:`classify_reactions` is the reaction taxonomy: exchange, GPR-less
+import / simple / advanced transport, spontaneous, GPR-less extracellular, custom, and
+"any without a GPR". The staged schedule (:func:`raven_python.init.get_init_steps`) selects
+which categories to keep out of each MILP step via an 8-bit pattern.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+
+import cobra
+
+from raven_python.init.merge import merge_linear
+from raven_python.tasks import Task, find_task_essential_reactions
+
+
+@dataclass
+class ReactionMasks:
+ """Reaction-category id sets (RAVEN's ``toIgnore*``), in 8-bit-pattern order.
+
+ ``ignored(pattern)`` returns the union of the categories whose bit is set — the
+ reactions held out of (left untouched by) that MILP step.
+ """
+
+ exchange: set[str] = field(default_factory=set) # b1
+ import_rxns: set[str] = field(default_factory=set) # b2
+ simple_transport: set[str] = field(default_factory=set) # b3
+ advanced_transport: set[str] = field(default_factory=set) # b4
+ spontaneous: set[str] = field(default_factory=set) # b5
+ extracellular: set[str] = field(default_factory=set) # b6 (no-GPR, all mets in ext comp)
+ custom: set[str] = field(default_factory=set) # b7
+ no_gpr: set[str] = field(default_factory=set) # b8
+
+ def _ordered(self) -> list[set[str]]:
+ return [self.exchange, self.import_rxns, self.simple_transport,
+ self.advanced_transport, self.spontaneous, self.extracellular,
+ self.custom, self.no_gpr]
+
+ def ignored(self, pattern: Iterable[int]) -> set[str]:
+ out: set[str] = set()
+ for bit, group in zip(pattern, self._ordered(), strict=True):
+ if bit:
+ out |= group
+ return out
+
+
+def _is_advanced_transport(rxn: cobra.Reaction) -> bool:
+ """Even number (>2) of mets pairing up by name across compartments with canceling stoich."""
+ mets = list(rxn.metabolites.items())
+ if len(mets) <= 2 or len(mets) % 2 != 0:
+ return False
+ remaining = [(m.name, m.compartment, c) for m, c in mets]
+ while remaining:
+ name, comp, coeff = remaining[0]
+ matches = [i for i in range(1, len(remaining)) if remaining[i][0] == name]
+ if len(matches) != 1:
+ return False
+ j = matches[0]
+ if coeff + remaining[j][2] != 0 or comp == remaining[j][1]:
+ return False
+ remaining = [r for k, r in enumerate(remaining) if k not in (0, j)]
+ return True
+
+
+def classify_reactions(
+ model: cobra.Model,
+ *,
+ ext_comp: str = "e",
+ spontaneous: Iterable[str] = (),
+ custom: Iterable[str] = (),
+) -> ReactionMasks:
+ """Classify reactions into the ftINIT ``toIgnore`` categories (``prepINITModel``).
+
+ ``ext_comp`` is the extracellular compartment. ``spontaneous``/``custom`` are
+ reaction-id lists. A reaction is "GPR-less" when its gene rule is empty.
+ """
+ spont, cust = set(spontaneous), set(custom)
+ masks = ReactionMasks(
+ exchange={r.id for r in model.boundary},
+ spontaneous={r.id for r in model.reactions if r.id in spont},
+ custom={r.id for r in model.reactions if r.id in cust},
+ no_gpr={r.id for r in model.reactions if not r.gene_reaction_rule.strip()},
+ )
+ for rxn in model.reactions:
+ if rxn.gene_reaction_rule.strip():
+ continue # transport categories are GPR-less only
+ mets = list(rxn.metabolites)
+ if len(mets) == 2:
+ (m1, m2) = mets
+ if m1.compartment != m2.compartment and m1.name == m2.name:
+ if ext_comp in (m1.compartment, m2.compartment):
+ masks.import_rxns.add(rxn.id)
+ else:
+ masks.simple_transport.add(rxn.id)
+ elif _is_advanced_transport(rxn):
+ masks.advanced_transport.add(rxn.id)
+ if len(mets) > 1 and all(m.compartment == ext_comp for m in mets):
+ masks.extracellular.add(rxn.id)
+ return masks
+
+
+@dataclass
+class PrepData:
+ """One-time ftINIT preprocessing of a template model (RAVEN ``prepData``).
+
+ Built once per template, reused across samples. ``min_model`` is the merged model
+ the MILP runs on; ``orig_rxn_ids``/``group_ids`` map its reactions back to the
+ ``ref_model`` (the simplified, pre-merge reference). ``essential_rxns`` are in
+ **merged** ids and pre-oriented irreversibly (so the MILP forces flux *forward*).
+ ``masks`` is on ``ref_model`` (= original) ids.
+ """
+
+ ref_model: cobra.Model
+ min_model: cobra.Model
+ orig_rxn_ids: list[str]
+ group_ids: list[int]
+ reversed_rxns: list[bool]
+ masks: ReactionMasks
+ essential_rxns: set[str] = field(default_factory=set)
+ essential_mets_for_tasks: set[str] = field(default_factory=set)
+ tasks: list[Task] = field(default_factory=list)
+
+ @property
+ def group_of(self) -> dict[str, int]:
+ return dict(zip(self.orig_rxn_ids, self.group_ids, strict=True))
+
+
+def rescale_for_init(model: cobra.Model, max_stoich_diff: float = 25.0) -> None:
+ """Compress each reaction's stoichiometric dynamic range.
+
+ Large spreads in stoichiometric coefficients (e.g. a biomass/pool reaction with
+ coefficients from 1e-3 to 1e2) force correspondingly extreme flux magnitudes, so no
+ single MILP big-M fits all reactions. RAVEN, per reaction: caps every ``|coeff|`` at
+ ``max_stoich_diff × min|coeff|`` (keeping signs), then scales the whole reaction so its
+ mean ``|coeff|`` is 1. Bounds are reset to ``±1000`` afterwards. Modifies ``model`` in
+ place; only the merged MILP model is scaled (the final output maps back to the
+ unscaled ``ref_model`` by reaction id, so reaction *selection* is unaffected).
+ """
+ for rxn in model.reactions:
+ items = list(rxn.metabolites.items())
+ if not items:
+ continue
+ cap = max_stoich_diff * min(abs(c) for _, c in items)
+ capped = {m: ((cap if c > 0 else -cap) if abs(c) > cap else c) for m, c in items}
+ total = sum(abs(c) for c in capped.values())
+ scale = (len(capped) / total) if total else 1.0
+ rxn.add_metabolites({m: c * scale for m, c in capped.items()}, combine=False)
+ for rxn in model.reactions: # RAVEN resets bounds to the standard ±1000 after scaling
+ if rxn.upper_bound > 0:
+ rxn.upper_bound = 1000.0
+ if rxn.lower_bound < 0:
+ rxn.lower_bound = -1000.0
+
+
+def _orient_forward(rxn: cobra.Reaction, direction: int) -> None:
+ """Make ``rxn`` carry flux only in its forced direction (irreversible forward)."""
+ if direction < 0: # flip so the forced (reverse) direction becomes forward
+ rxn.add_metabolites({m: -2 * c for m, c in rxn.metabolites.items()})
+ rxn.bounds = (-rxn.upper_bound, -rxn.lower_bound)
+ rxn.lower_bound = max(rxn.lower_bound, 0.0)
+
+
+def prep_init_model(
+ template: cobra.Model,
+ tasks: Iterable[Task] | None = None,
+ *,
+ ext_comp: str = "e",
+ spontaneous: Iterable[str] = (),
+ custom: Iterable[str] = (),
+ essential_cache_path=None,
+ scale: bool = True,
+) -> PrepData:
+ """Build :class:`PrepData` from a template model — the once-per-template work shared
+ by every ftINIT sample on this model.
+
+ With ``tasks``, discovers the task-essential reactions (kept regardless of score),
+ orients them irreversibly in their required direction, and drops tasks that are
+ infeasible. Then classifies reactions into the omics-independent categories, linearly
+ merges, and (unless ``scale=False``) rescales the merged model's stoichiometry
+ (:func:`rescale_for_init`) so a single MILP big-M is valid across all reactions —
+ without this, genome-scale ftINIT is infeasible / intractable.
+
+ ``essential_cache_path`` makes the (slow, genome-scale) essential-reaction discovery
+ **resumable** across interruptions — see :func:`find_task_essential_reactions`.
+ """
+ ref_model = template.copy()
+
+ essential_pre: dict[str, int] = {}
+ task_mets: set[str] = set()
+ kept_tasks: list[Task] = []
+ if tasks is not None:
+ tasks = list(tasks)
+ ess = find_task_essential_reactions(ref_model, tasks, cache_path=essential_cache_path)
+ essential_pre = ess.reactions
+ task_mets = ess.task_metabolites
+ kept_tasks = [t for t in tasks if t.id not in ess.failed_tasks]
+
+ # Orient essentials irreversibly (forced direction → forward) before merging, so
+ # the merge keeps them forward and the MILP forces them with a simple lower bound.
+ for rid, direction in essential_pre.items():
+ _orient_forward(ref_model.reactions.get_by_id(rid), direction)
+
+ masks = classify_reactions(ref_model, ext_comp=ext_comp,
+ spontaneous=spontaneous, custom=custom)
+
+ min_model, orig_ids, group_ids, reversed_rxns = merge_linear(ref_model)
+ if scale: # compress stoichiometric dynamic range so the MILP big-M fits all reactions
+ rescale_for_init(min_model)
+ group_of = dict(zip(orig_ids, group_ids, strict=True))
+
+ # Map essentials to the merged model: the survivor of each group containing an
+ # essential (or the reaction itself if unmerged). All are forward after orientation.
+ # An essential that merged into a group which collapsed away (e.g. a trivial
+ # source→sink chain) has no survivor and imposes no constraint — skip it.
+ survivor_by_group = {group_of[r.id]: r.id for r in min_model.reactions if group_of[r.id]}
+ essential_merged: set[str] = set()
+ for rid in essential_pre:
+ gid = group_of[rid]
+ if gid == 0:
+ essential_merged.add(rid)
+ elif gid in survivor_by_group:
+ essential_merged.add(survivor_by_group[gid])
+
+ return PrepData(
+ ref_model=ref_model,
+ min_model=min_model,
+ orig_rxn_ids=orig_ids,
+ group_ids=group_ids,
+ reversed_rxns=reversed_rxns,
+ masks=masks,
+ essential_rxns=essential_merged,
+ essential_mets_for_tasks=task_mets,
+ tasks=kept_tasks,
+ )
diff --git a/src/raven_python/init/score.py b/src/raven_python/init/score.py
new file mode 100644
index 0000000..6e14f86
--- /dev/null
+++ b/src/raven_python/init/score.py
@@ -0,0 +1,86 @@
+"""Score reactions from gene scores via the GPR.
+
+Maps per-gene scores (e.g. expression-derived: present → positive, absent → negative)
+to per-reaction scores by walking each reaction's GPR: genes joined by **OR**
+(isozymes) are combined with ``isozyme_scoring`` (default ``max``); genes joined by
+**AND** (complexes) with ``complex_scoring`` (default ``min``). Genes missing from
+``gene_scores`` are *omitted*; a reaction with no genes — or whose genes are all
+missing — gets ``no_gene_score`` (default −2). These reaction scores feed
+:func:`raven_python.init.run_init` and :func:`raven_python.init.ftinit`.
+
+Upstream — the omics-data → gene-score step (thresholding, expression levels) — lives
+in :mod:`raven_python.omics`; this function takes gene scores as given.
+"""
+from __future__ import annotations
+
+import ast
+import math
+import statistics
+from collections.abc import Mapping
+
+import cobra
+
+_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean}
+
+
+def gene_scores_from_expression(
+ expression: Mapping[str, float],
+ reference: Mapping[str, float] | float,
+ *,
+ factor: float = 5.0,
+ max_score: float = 10.0,
+ min_score: float = -5.0,
+) -> dict[str, float]:
+ """Gene scores from RNA-seq/array expression, RAVEN's ``5·ln(level/reference)``.
+
+ This is tINIT's usual entry point (RNA-seq is the common case; single-cell and
+ HPA are alternative upstream sources). ``reference`` is either a per-gene
+ reference level (e.g. the cross-sample mean) or a single threshold for all genes:
+ a gene expressed above its reference scores positive, below it negative. The
+ score is clamped to ``[min_score, max_score]``; non-positive level/reference (and
+ missing reference) → ``min_score`` (RAVEN maps these NaNs to -5).
+ """
+ scalar = isinstance(reference, (int, float))
+ scores: dict[str, float] = {}
+ for gene, level in expression.items():
+ ref = reference if scalar else reference.get(gene)
+ if not level or not ref or level <= 0 or ref <= 0:
+ scores[gene] = min_score
+ else:
+ scores[gene] = max(min(factor * math.log(level / ref), max_score), min_score)
+ return scores
+
+
+def _score_node(node, gene_scores: Mapping[str, float], iso, cplx) -> float | None:
+ if isinstance(node, ast.Name):
+ return gene_scores.get(node.id) # None if the gene has no score
+ if isinstance(node, ast.BoolOp):
+ agg = iso if isinstance(node.op, ast.Or) else cplx
+ vals = [s for v in node.values if (s := _score_node(v, gene_scores, iso, cplx)) is not None]
+ return agg(vals) if vals else None
+ return None
+
+
+def score_reactions_from_genes(
+ model: cobra.Model,
+ gene_scores: Mapping[str, float],
+ *,
+ isozyme_scoring: str = "max",
+ complex_scoring: str = "min",
+ no_gene_score: float = -2.0,
+) -> dict[str, float]:
+ """Return ``{reaction_id: score}`` from per-gene scores via each reaction's GPR."""
+ for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)):
+ if value not in _AGG:
+ raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.")
+ iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring]
+
+ scores: dict[str, float] = {}
+ for rxn in model.reactions:
+ body = rxn.gpr.body
+ if body is None or not rxn.genes:
+ scores[rxn.id] = no_gene_score
+ else:
+ value = _score_node(body, gene_scores, iso, cplx)
+ scores[rxn.id] = no_gene_score if value is None else float(value)
+ return scores
diff --git a/src/raven_python/init/steps.py b/src/raven_python/init/steps.py
new file mode 100644
index 0000000..d8a7b86
--- /dev/null
+++ b/src/raven_python/init/steps.py
@@ -0,0 +1,62 @@
+"""ftINIT step schedule.
+
+ftINIT runs as a short sequence of MILP steps instead of one big MILP. Each step
+(:class:`InitStep`) chooses which reaction categories to hold out of the problem
+(``ignore_mask``, an 8-bit pattern over :class:`raven_python.init.ReactionMasks`), whether
+to drop positive reversibles and allow metabolite secretion, and how to treat the
+reactions turned on by previous steps (``'ignore'`` for the first step, ``'essential'``
+to fix them on). :func:`get_init_steps` builds the standard schedules.
+
+The default ``'1+1'`` is two steps: step 1 decides only the GPR-associated reactions
+(everything GPR-less is held out); step 2 brings the GPR-less transport / extracellular
+reactions in with step-1 reactions fixed as essential. ``'full'`` is the single-MILP
+classic-tINIT variant (nothing held out).
+"""
+from __future__ import annotations
+
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+
+# 8-bit ignore patterns (exchange, import, simple-transp, adv-transp, spontaneous,
+# extracellular, custom, no-GPR) — see ReactionMasks.
+_ALL_NO_GPR_KEPT = (1, 1, 1, 1, 1, 1, 1, 0) # hold out every GPR-less category but "all no-GPR"
+_EXCH_SPONT = (1, 0, 0, 0, 1, 0, 0, 0) # hold out only exchange + spontaneous
+_NONE = (0, 0, 0, 0, 0, 0, 0, 0)
+
+
+@dataclass
+class InitStep:
+ """One ftINIT MILP step."""
+
+ how_to_use_prev: str = "essential" # 'ignore' | 'essential'
+ ignore_mask: tuple[int, ...] = _ALL_NO_GPR_KEPT
+ pos_rev_off: bool = False # drop positive reversibles from the problem
+ allow_met_secr: bool = False # relax S·v = 0 to ≥ 0
+ mets_to_ignore: Sequence[str] = field(default_factory=tuple) # met names zeroed from S (e.g. H2O)
+
+
+def get_init_steps(series: str = "1+1", *, mets_to_ignore: Sequence[str] = ()) -> list[InitStep]:
+ """Return the step schedule for a named ftINIT ``series`` (RAVEN ``getINITSteps``).
+
+ ``'1+1'`` (default, step 1+2 merged), ``'2+1'`` (3-step), ``'1+0'``/``'2+0'``
+ (skip the final GPR-less step), ``'full'`` (single MILP). ``mets_to_ignore`` are
+ metabolite names removed from the stoichiometry in each step (e.g. H2O, H+).
+ """
+ m = tuple(mets_to_ignore)
+ s1 = InitStep("ignore", _ALL_NO_GPR_KEPT, mets_to_ignore=m)
+ s1_posrev = InitStep("ignore", _ALL_NO_GPR_KEPT, pos_rev_off=True, allow_met_secr=True,
+ mets_to_ignore=m)
+ s2_all = InitStep("essential", _ALL_NO_GPR_KEPT, mets_to_ignore=m)
+ s_final = InitStep("essential", _EXCH_SPONT, mets_to_ignore=m)
+
+ if series == "1+1":
+ return [s1, s_final]
+ if series == "2+1":
+ return [s1_posrev, s2_all, s_final]
+ if series == "1+0":
+ return [s1]
+ if series == "2+0":
+ return [s1_posrev, s2_all]
+ if series == "full":
+ return [InitStep("ignore", _NONE, mets_to_ignore=m)]
+ raise ValueError(f"Unknown ftINIT series {series!r}; expected 1+1, 2+1, 1+0, 2+0, full.")
diff --git a/src/raven_python/init/taskfill.py b/src/raven_python/init/taskfill.py
new file mode 100644
index 0000000..58501ce
--- /dev/null
+++ b/src/raven_python/init/taskfill.py
@@ -0,0 +1,183 @@
+"""Task gap-filling for ftINIT.
+
+After ftINIT extracts a context-specific model, some metabolic tasks may no longer be
+feasible (the scoring removed reactions a task needs). :func:`fill_tasks` restores
+feasibility by adding back the **minimum-cost** set of reactions from the reference
+(template) model — cost = ``−score``, so high-scoring reactions are preferred — one
+task at a time, only for tasks that are actually infeasible (a cheap LP check gates
+the expensive MILP), accumulating additions across tasks.
+
+This is a different MILP from ftINIT's main extraction: it *adds* reactions to satisfy
+the task's ranged metabolite bounds (RAVEN's two-column ``b``), rather than selecting
+which to keep by expression score. Exchange reactions are not used to fill gaps (task
+inputs/outputs come from the task's ``b``), so they are excluded as candidates.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+
+import cobra
+from optlang.symbolics import Real, add, mul
+
+from raven_python.tasks import Task
+from raven_python.tasks.check import (
+ _metabolite_bounds,
+ _set_constraint_bounds,
+ apply_task_constraints,
+ task_name_maps,
+)
+
+_DEFAULT_SCORE = -1.0 # RAVEN: missing scores default to -1 (cost 1)
+_MAX_SCORE = -0.1 # RAVEN min(score, -0.1): every added reaction costs ≥ 0.1
+
+
+@dataclass
+class TaskFillResult:
+ """Result of :func:`fill_tasks`: the gap-filled model and what was added."""
+
+ model: cobra.Model
+ added_reactions: list[str]
+ failed_tasks: list[str]
+
+
+def _closed_copy(model: cobra.Model) -> cobra.Model:
+ """A copy with boundary reactions closed: task I/O comes only from the task's b."""
+ out = model.copy()
+ for rxn in out.boundary:
+ rxn.bounds = (0.0, 0.0)
+ return out
+
+
+def _feasible(model: cobra.Model, task: Task, name_to_id, comp_to_ids) -> bool:
+ """Is ``task`` feasible in ``model`` (boundaries closed)? Tested in place, then reverted.
+
+ Avoids copying the (genome-scale) model for each of the task list's feasibility checks
+ — the copy dominated gap-fill runtime. ``with model:`` reverts the closed boundaries and
+ everything ``apply_task_constraints`` does through cobra's API; the untracked direct
+ metabolite mass-balance bound edits are snapshotted and restored (as in check_tasks).
+ """
+ bounds, missing = _metabolite_bounds(task, name_to_id, comp_to_ids)
+ if missing:
+ return False
+ saved = {mid: (model.constraints[mid].lb, model.constraints[mid].ub) for mid in bounds}
+ try:
+ with model:
+ for rxn in model.boundary:
+ rxn.bounds = (0.0, 0.0)
+ _, error = apply_task_constraints(model, task, name_to_id, comp_to_ids)
+ if error is not None:
+ return False
+ model.slim_optimize()
+ return model.solver.status == "optimal"
+ finally:
+ for mid, (lb, ub) in saved.items():
+ _set_constraint_bounds(model.constraints[mid], lb, ub)
+
+
+def _fill_one_task(
+ model: cobra.Model, candidates: list[cobra.Reaction], task: Task,
+ costs: dict[str, float], *, mip_gap: float | None = None, time_limit: float | None = None,
+) -> list[str]:
+ """Min-cost set of ``candidates`` to make ``task`` feasible in ``model`` (the MILP).
+
+ ``mip_gap``/``time_limit`` bound this MILP (it has a binary per candidate reaction —
+ thousands). Unbounded, proving min-cost optimality is intractable when degraded input
+ has broken many tasks at once; a near-optimal fill (slightly more reactions) is the
+ right trade for tractability, exactly as for the main ftINIT MILP.
+ """
+ if not candidates: # nothing left to add → task cannot be made feasible
+ raise RuntimeError(f"gap-filling found no candidates for task {task.id!r}.")
+ combined = _closed_copy(model) # task I/O via the task's b, not the model's exchanges
+ combined.add_reactions([r.copy() for r in candidates])
+ name_to_id, comp_to_ids = task_name_maps(combined)
+ _, error = apply_task_constraints(combined, task, name_to_id, comp_to_ids)
+ if error is not None:
+ raise RuntimeError(f"task {task.id!r} could not be applied to the reference: {error}")
+
+ prob = combined.problem
+ extras = []
+ objective_terms = []
+ for cand in candidates:
+ rxn = combined.reactions.get_by_id(cand.id)
+ y = prob.Variable(f"_fill_{cand.id}", type="binary")
+ # off ⇒ no flux; on ⇒ the reaction's own bounds apply.
+ extras += [
+ y,
+ prob.Constraint(rxn.flux_expression - rxn.upper_bound * y, ub=0.0,
+ name=f"_fillub_{cand.id}"),
+ prob.Constraint(rxn.flux_expression - rxn.lower_bound * y, lb=0.0,
+ name=f"_filllb_{cand.id}"),
+ ]
+ objective_terms.append(mul([Real(costs[cand.id]), y]))
+ combined.add_cons_vars(extras)
+ # add() over a flat list, not Python sum() — the latter is O(n²) in sympy and with
+ # thousands of candidates dominates gap-fill runtime (see ftINIT/tINIT, same fix).
+ combined.objective = prob.Objective(add(objective_terms), direction="min")
+ if time_limit is not None:
+ combined.solver.configuration.timeout = int(time_limit)
+ if mip_gap is not None:
+ try: # Gurobi-specific; harmless if the backend differs
+ combined.solver.problem.Params.MIPGap = mip_gap
+ except Exception: # noqa: BLE001
+ pass
+ combined.slim_optimize()
+ # Accept a near-optimal incumbent (mip_gap/time_limit); only a truly infeasible fill
+ # (no incumbent) means the task cannot be satisfied from the reference.
+ if combined.solver.status not in ("optimal", "feasible", "suboptimal", "time_limit") or \
+ combined.variables[f"_fill_{candidates[0].id}"].primal is None:
+ raise RuntimeError(f"gap-filling found no way to make task {task.id!r} feasible.")
+ return [c.id for c in candidates
+ if (combined.variables[f"_fill_{c.id}"].primal or 0.0) > 0.5]
+
+
+def fill_tasks(
+ model: cobra.Model,
+ reference_model: cobra.Model,
+ tasks: Iterable[Task],
+ *,
+ rxn_scores: Mapping[str, float] | None = None,
+ mip_gap: float | None = None,
+ time_limit: float | None = None,
+) -> TaskFillResult:
+ """Add minimum-cost reference reactions so every task is feasible in ``model``.
+
+ ``reference_model`` supplies the candidate reactions (those not already in
+ ``model``, excluding exchange/boundary reactions). ``rxn_scores`` (original
+ reaction id → score) sets the cost of adding each candidate as ``−min(score,
+ −0.1)`` (missing → cost 1). Tasks already feasible are skipped; ``should_fail``
+ tasks are ignored. The model is carried forward, so later tasks see earlier
+ additions. Returns the gap-filled model and the reactions added.
+
+ Boundary reactions are closed while testing/solving each task, so task inputs and
+ outputs come solely from the task's ranged metabolite bounds (RAVEN gap-fills the
+ exchange-free model). The returned model keeps its boundary reactions.
+ """
+ scores = dict(rxn_scores or {})
+ tasks = list(tasks)
+ in_model = {r.id for r in model.reactions}
+ candidates = [r for r in reference_model.reactions
+ if r.id not in in_model and not r.boundary]
+ costs = {r.id: -min(scores.get(r.id, _DEFAULT_SCORE), _MAX_SCORE) for r in candidates}
+
+ out = model.copy()
+ added: list[str] = []
+ failed: list[str] = []
+ for task in tasks:
+ if task.should_fail:
+ continue
+ name_to_id, comp_to_ids = task_name_maps(out)
+ if _feasible(out, task, name_to_id, comp_to_ids):
+ continue
+ # Only offer reactions not yet in the (growing) model.
+ present = {r.id for r in out.reactions}
+ avail = [r for r in candidates if r.id not in present]
+ try:
+ chosen = _fill_one_task(out, avail, task, costs, mip_gap=mip_gap, time_limit=time_limit)
+ except RuntimeError:
+ failed.append(task.id)
+ continue
+ if chosen:
+ out.add_reactions([reference_model.reactions.get_by_id(c).copy() for c in chosen])
+ added.extend(chosen)
+ return TaskFillResult(out, added, failed)
diff --git a/src/raven_python/io/__init__.py b/src/raven_python/io/__init__.py
new file mode 100644
index 0000000..bc70511
--- /dev/null
+++ b/src/raven_python/io/__init__.py
@@ -0,0 +1,15 @@
+"""RAVEN-specific I/O: YAML (cobra + Metabolic Atlas / Human-GEM extensions), SIF,
+Excel export, and the Standard-GEM ``model//…`` git layout.
+"""
+from raven_python.io.excel import export_to_excel
+from raven_python.io.git import export_for_git
+from raven_python.io.sif import export_model_to_sif
+from raven_python.io.yaml import read_yaml_model, write_yaml_model
+
+__all__ = [
+ "export_for_git",
+ "export_model_to_sif",
+ "export_to_excel",
+ "read_yaml_model",
+ "write_yaml_model",
+]
diff --git a/src/raven_python/io/excel.py b/src/raven_python/io/excel.py
new file mode 100644
index 0000000..cf6196e
--- /dev/null
+++ b/src/raven_python/io/excel.py
@@ -0,0 +1,136 @@
+"""Export a model to the RAVEN Microsoft Excel format.
+
+Writes the five-sheet RAVEN xlsx layout — RXNS, METS, COMPS, GENES, MODEL — pulling
+RAVEN-specific values back out of cobra's ``annotation`` / ``notes`` (where the
+raven_python YAML reader stashes them). Excel *import* is intentionally not provided.
+
+Requires the optional ``openpyxl`` dependency (``pip install raven_python[excel]``).
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import cobra
+
+
+def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str:
+ """RAVEN MIRIAM column: ``namespace/id;namespace/id2;...`` (sorted)."""
+ parts = []
+ for namespace in sorted(annotation):
+ if namespace in exclude:
+ continue
+ values = annotation[namespace]
+ if isinstance(values, str):
+ values = [values]
+ parts.extend(f"{namespace}/{value}" for value in values)
+ return ";".join(parts)
+
+
+def _equation(rxn: cobra.Reaction) -> str:
+ """Human-readable equation in RAVEN ``name[comp]`` form."""
+
+ def side(items):
+ return " + ".join(
+ f"{abs(coef):g} {met.name}[{met.compartment}]" for met, coef in items
+ )
+
+ reactants = [(m, c) for m, c in rxn.metabolites.items() if c < 0]
+ products = [(m, c) for m, c in rxn.metabolites.items() if c > 0]
+ arrow = " <=> " if rxn.reversibility else " => "
+ return f"{side(reactants)}{arrow}{side(products)}"
+
+
+def _ec_codes(rxn: cobra.Reaction) -> str:
+ codes = rxn.annotation.get("ec-code", [])
+ if isinstance(codes, str):
+ codes = [codes]
+ return ";".join(codes)
+
+
+def export_to_excel(
+ model: cobra.Model, path: str | Path, *, sort_ids: bool = False
+) -> None:
+ """Write ``model`` to a RAVEN-format ``.xlsx`` file.
+
+ Parameters
+ ----------
+ sort_ids
+ If True, write reactions/metabolites/genes sorted alphabetically by ID
+ (the model itself is not modified).
+ """
+ try:
+ from openpyxl import Workbook
+ except ImportError as exc: # pragma: no cover - exercised only without openpyxl
+ raise ImportError(
+ "export_to_excel requires openpyxl. Install it with "
+ "`pip install raven_python[excel]` (or `pip install openpyxl`)."
+ ) from exc
+
+ reactions = sorted(model.reactions, key=lambda r: r.id) if sort_ids else list(model.reactions)
+ metabolites = (
+ sorted(model.metabolites, key=lambda m: m.id) if sort_ids else list(model.metabolites)
+ )
+ genes = sorted(model.genes, key=lambda g: g.id) if sort_ids else list(model.genes)
+ metadata = dict(model.notes.get("metaData", {})) if model.notes else {}
+
+ wb = Workbook()
+ wb.remove(wb.active) # drop the default empty sheet
+
+ # --- RXNS ---
+ ws = wb.create_sheet("RXNS")
+ ws.append(
+ ["#", "ID", "NAME", "EQUATION", "EC-NUMBER", "GENE ASSOCIATION", "LOWER BOUND",
+ "UPPER BOUND", "OBJECTIVE", "COMPARTMENT", "MIRIAM", "SUBSYSTEM",
+ "REPLACEMENT ID", "NOTE", "REFERENCE", "CONFIDENCE SCORE"]
+ )
+ for r in reactions:
+ subsystem = r.subsystem
+ if isinstance(subsystem, (list, tuple)):
+ subsystem = ";".join(subsystem)
+ ws.append([
+ None, r.id, r.name, _equation(r), _ec_codes(r), r.gene_reaction_rule,
+ r.lower_bound, r.upper_bound,
+ r.objective_coefficient or None, None,
+ _miriam_string(r.annotation, exclude=("ec-code",)), subsystem, None,
+ r.notes.get("note"), r.notes.get("references"), r.notes.get("confidence_score"),
+ ])
+
+ # --- METS ---
+ ws = wb.create_sheet("METS")
+ ws.append(["#", "ID", "NAME", "UNCONSTRAINED", "MIRIAM", "COMPOSITION", "InChI",
+ "COMPARTMENT", "REPLACEMENT ID", "CHARGE"])
+ for m in metabolites:
+ inchi = m.notes.get("inchis")
+ ws.append([
+ None, f"{m.name}[{m.compartment}]", m.name, None,
+ _miriam_string(m.annotation, exclude=("smiles",)),
+ None if inchi else m.formula, inchi, m.compartment, m.id, m.charge,
+ ])
+
+ # --- COMPS ---
+ ws = wb.create_sheet("COMPS")
+ ws.append(["#", "ABBREVIATION", "NAME", "INSIDE", "MIRIAM"])
+ comps = sorted(model.compartments) if sort_ids else list(model.compartments)
+ for cid in comps:
+ ws.append([None, cid, model.compartments.get(cid, ""), None, None])
+
+ # --- GENES ---
+ if genes:
+ ws = wb.create_sheet("GENES")
+ ws.append(["#", "NAME", "MIRIAM", "SHORT NAME", "COMPARTMENT"])
+ for g in genes:
+ ws.append([None, g.id, _miriam_string(g.annotation), g.name, None])
+
+ # --- MODEL ---
+ ws = wb.create_sheet("MODEL")
+ ws.append(["#", "ID", "NAME", "TAXONOMY", "DEFAULT LOWER", "DEFAULT UPPER",
+ "CONTACT GIVEN NAME", "CONTACT FAMILY NAME", "CONTACT EMAIL",
+ "ORGANIZATION", "NOTES"])
+ ws.append([
+ None, model.id or "blankID", model.name or "blankName",
+ metadata.get("taxonomy"), metadata.get("defaultLB"), metadata.get("defaultUB"),
+ metadata.get("givenName"), metadata.get("familyName"), metadata.get("email"),
+ metadata.get("organization"), metadata.get("note"),
+ ])
+
+ wb.save(str(path))
diff --git a/src/raven_python/io/git.py b/src/raven_python/io/git.py
new file mode 100644
index 0000000..80bf8e8
--- /dev/null
+++ b/src/raven_python/io/git.py
@@ -0,0 +1,106 @@
+"""Export a model into a Standard-GEM versioned-repository layout.
+
+Writes the model in several formats into the Standard-GEM folder structure (a
+``model/`` directory with one subfolder per format), ready to commit to a
+Git-maintained model repository (Metabolic Atlas / Human-GEM / yeast-GEM style),
+plus a ``dependencies.txt`` recording tool versions.
+
+Thin orchestration over the writers raven_python already exposes: ``write_yaml_model``,
+cobra's ``write_sbml_model`` and ``save_matlab_model``, ``export_to_excel``, plus a
+single-file reaction table (txt).
+"""
+from __future__ import annotations
+
+import importlib.metadata as _md
+import platform
+from collections.abc import Iterable
+from pathlib import Path
+
+import cobra
+
+from raven_python.io.excel import _equation, export_to_excel
+from raven_python.io.yaml import write_yaml_model
+from raven_python.utils.sort import sort_identifiers
+
+_ALL_FORMATS = ("yml", "xml", "mat", "xlsx", "txt")
+
+
+def _version(package: str) -> str:
+ try:
+ return _md.version(package)
+ except _md.PackageNotFoundError:
+ return "unknown"
+
+
+def _write_txt(model: cobra.Model, path: Path) -> None:
+ """Single-file, human-readable reaction table (RAVEN exportForGit txt)."""
+ with open(path, "w", encoding="utf-8") as fh:
+ fh.write("Rxn name\tFormula\tGene-reaction association\tLB\tUB\tObjective\n")
+ for r in model.reactions:
+ fh.write(
+ f"{r.id}\t{_equation(r)}\t{r.gene_reaction_rule}\t"
+ f"{r.lower_bound:g}\t{r.upper_bound:g}\t{r.objective_coefficient:g}\n"
+ )
+
+
+def export_for_git(
+ model: cobra.Model,
+ path: str | Path = ".",
+ *,
+ prefix: str = "model",
+ formats: Iterable[str] = ("yml", "xml", "mat", "xlsx"),
+ sub_dirs: bool = True,
+) -> Path:
+ """Write ``model`` into a Standard-GEM repository layout.
+
+ Parameters
+ ----------
+ path
+ Directory to populate.
+ prefix
+ Base filename for every format (default ``"model"``).
+ formats
+ Which formats to write; any of ``"yml"``, ``"xml"``, ``"mat"``,
+ ``"xlsx"``, ``"txt"`` (default ``yml``/``xml``/``mat``/``xlsx``).
+ sub_dirs
+ If True (default), write ``model//.`` (standard-GEM
+ layout); otherwise all files go directly in ``path``.
+
+ Returns
+ -------
+ pathlib.Path
+ The root directory written to.
+ """
+ formats = list(formats)
+ unknown = set(formats) - set(_ALL_FORMATS)
+ if unknown:
+ raise ValueError(f"Unknown format(s): {sorted(unknown)}; allowed: {_ALL_FORMATS}")
+
+ # Sort a copy so the caller's model is untouched.
+ model = sort_identifiers(model.copy())
+
+ root = Path(path) / "model" if sub_dirs else Path(path)
+ root.mkdir(parents=True, exist_ok=True)
+
+ def target(fmt: str) -> Path:
+ folder = root / fmt if sub_dirs else root
+ folder.mkdir(parents=True, exist_ok=True)
+ return folder / f"{prefix}.{fmt}"
+
+ if "yml" in formats:
+ write_yaml_model(model, target("yml"))
+ if "xml" in formats:
+ cobra.io.write_sbml_model(model, str(target("xml")))
+ if "mat" in formats:
+ cobra.io.save_matlab_model(model, str(target("mat")))
+ if "xlsx" in formats:
+ export_to_excel(model, target("xlsx"))
+ if "txt" in formats:
+ _write_txt(model, target("txt"))
+
+ with open(root / "dependencies.txt", "w", encoding="utf-8") as fh:
+ fh.write(f"python\t{platform.python_version()}\n")
+ fh.write(f"cobra\t{_version('cobra')}\n")
+ fh.write(f"raven_python\t{_version('raven_python')}\n")
+
+ return root
diff --git a/src/raven_python/io/sif.py b/src/raven_python/io/sif.py
new file mode 100644
index 0000000..9e73efa
--- /dev/null
+++ b/src/raven_python/io/sif.py
@@ -0,0 +1,96 @@
+"""Export a model to Cytoscape SIF (Simple Interaction Format).
+
+Three graph types are supported:
+
+* ``"rc"`` reaction–compound: each reaction linked to its metabolites;
+* ``"rr"`` reaction–reaction: reactions linked when they share a metabolite;
+* ``"cc"`` compound–compound: each substrate linked to the products of the
+ reactions it feeds (computed on an irreversible copy, as RAVEN does, to avoid
+ spurious double links from reversible reactions).
+
+A SIF line is ``source graph_type target1 target2 ...``.
+"""
+from __future__ import annotations
+
+import warnings
+from collections import Counter
+from collections.abc import Mapping
+from pathlib import Path
+
+import cobra
+
+from raven_python.manipulation.irreversible import convert_to_irreversible
+
+_GRAPH_TYPES = ("rc", "rr", "cc")
+
+
+def _edges(model, graph_type):
+ """Yield (source_object, [target_objects]) per the graph type."""
+ if graph_type == "rc":
+ for rxn in model.reactions:
+ yield rxn, list(rxn.metabolites)
+ elif graph_type == "rr":
+ for rxn in model.reactions:
+ neighbours = {r for met in rxn.metabolites for r in met.reactions}
+ neighbours.discard(rxn)
+ yield rxn, list(neighbours)
+ else: # cc — on an irreversible copy
+ irrev = model.copy()
+ convert_to_irreversible(irrev)
+ for met in irrev.metabolites:
+ products: set = set()
+ for rxn in met.reactions:
+ if rxn.get_coefficient(met) < 0: # met is a substrate here
+ products.update(m for m, c in rxn.metabolites.items() if c > 0)
+ yield met, list(products)
+
+
+def export_model_to_sif(
+ model: cobra.Model,
+ path: str | Path,
+ graph_type: str = "rc",
+ *,
+ reaction_labels: Mapping[str, str] | None = None,
+ metabolite_labels: Mapping[str, str] | None = None,
+) -> None:
+ """Write ``model`` to a Cytoscape SIF file.
+
+ Parameters
+ ----------
+ graph_type
+ ``"rc"`` (reaction–compound, default), ``"rr"`` (reaction–reaction), or
+ ``"cc"`` (compound–compound).
+ reaction_labels, metabolite_labels
+ Optional ``{id: label}`` maps overriding the node labels (default: IDs).
+ """
+ if graph_type not in _GRAPH_TYPES:
+ raise ValueError(f"graph_type must be one of {_GRAPH_TYPES}, got {graph_type!r}")
+
+ rlabels = reaction_labels or {}
+ mlabels = metabolite_labels or {}
+
+ # Warn when the label maps collapse multiple distinct ids onto the same
+ # label: target-side dedup runs on labels, so the collision silently merges
+ # two nodes into one edge. Only check the ids actually mapped (cobra default
+ # labels are ids, which can't collide).
+ for kind, lmap in (("reaction", rlabels), ("metabolite", mlabels)):
+ duplicates = [lab for lab, n in Counter(lmap.values()).items() if n > 1]
+ if duplicates:
+ warnings.warn(
+ f"{kind}_labels maps multiple ids to the same label(s) "
+ f"({duplicates[:5]}{'…' if len(duplicates) > 5 else ''}); "
+ "SIF nodes are keyed by label, so those nodes will collapse.",
+ stacklevel=2,
+ )
+
+ def label(obj) -> str:
+ if isinstance(obj, cobra.Reaction):
+ return rlabels.get(obj.id, obj.id)
+ return mlabels.get(obj.id, obj.id)
+
+ with open(path, "w", encoding="utf-8") as handle:
+ for source, targets in _edges(model, graph_type):
+ src = label(source)
+ names = sorted({label(t) for t in targets} - {src})
+ if names:
+ handle.write(f"{src}\t{graph_type}\t" + "\t".join(names) + "\n")
diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py
new file mode 100644
index 0000000..151954b
--- /dev/null
+++ b/src/raven_python/io/yaml.py
@@ -0,0 +1,191 @@
+"""Read and write RAVEN/cobrapy YAML models.
+
+Aligned to RAVEN ``writeYAMLmodel.m`` / ``readYAMLmodel.m`` as of the
+``feat/geckopy-compat-yaml`` work (commit fa281a1), whose writer emits **cobra's
+native ``!!omap`` YAML**. Because the format *is* cobra's, the standard model
+content — id, name, compartments, and per-entry id/name/compartment/formula/
+charge/bounds/gene_reaction_rule/objective_coefficient/subsystem/metabolites and
+the whole ``annotation`` block (which carries ``smiles`` for metabolites,
+``ec-code`` for reactions, and all MIRIAM cross-references) — is read and written
+by ``cobra.io`` directly.
+
+This module only handles what cobra drops or mishandles:
+
+* **RAVEN-only top-level per-entry keys** that cobra ignores: ``inchis``,
+ ``deltaG``, ``metFrom`` and the free-text ``notes`` (metNotes) on metabolites;
+ ``confidence_score``, ``references``, ``rxnFrom``, ``deltaG`` and ``notes``
+ (rxnNotes) on reactions; ``protein`` on genes. These are stashed in the cobra
+ object's ``.notes`` dict on read and lifted back to top-level keys on write.
+* **Model-level extras** cobra ignores: ``version``, the ``metaData`` provenance
+ block, and the GECKO sections (``gecko_light``/``ec-rxns``/``ec-enzymes``),
+ preserved on ``model.notes`` for round-tripping.
+
+The reader also accepts the older RAVEN files (id/name nested in ``metaData``).
+"""
+from __future__ import annotations
+
+import gzip
+from collections import OrderedDict
+from pathlib import Path
+
+import cobra
+from cobra.io.dict import model_from_dict, model_to_dict
+from cobra.io.yaml import yaml as _cobra_yaml # ruamel round-trip YAML (handles !!omap)
+
+
+def _open_text(path: str | Path, mode: str):
+ """Open ``path`` as a text handle, transparently gzipping when it ends ``.gz``."""
+ if str(path).endswith(".gz"):
+ return gzip.open(path, f"{mode}t", encoding="utf-8")
+ return open(path, mode, encoding="utf-8")
+
+# RAVEN-only top-level per-entry keys -> the key used inside the cobra object's
+# .notes dict. ('notes' is RAVEN's free-text metNotes/rxnNotes; stored under
+# 'note' to avoid colliding with the notes container itself.)
+_MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note"))
+_RXN_FIELDS = (
+ ("confidence_score", "confidence_score"),
+ ("references", "references"),
+ ("rxnFrom", "rxnFrom"),
+ ("deltaG", "deltaG"),
+ ("notes", "note"),
+)
+_GENE_FIELDS = (("protein", "protein"),)
+
+_COBRA_TOP_KEYS = frozenset({"metabolites", "reactions", "genes", "compartments", "id", "name"})
+
+
+def _to_plain(obj):
+ if isinstance(obj, dict):
+ return {str(k): _to_plain(v) for k, v in obj.items()}
+ if isinstance(obj, (list, tuple)):
+ return [_to_plain(v) for v in obj]
+ if isinstance(obj, bool) or obj is None:
+ return obj
+ if isinstance(obj, int):
+ return int(obj)
+ if isinstance(obj, float):
+ return float(obj)
+ return obj if isinstance(obj, str) else str(obj)
+
+
+def _capture_entry_fields(entries, fields):
+ """Pop RAVEN-only top-level keys off each entry into a parallel notes dict.
+
+ Returns a list of ``{notes_key: value}`` dicts aligned with ``entries`` (so
+ cobra never sees these keys), to be attached to the built objects afterwards.
+ """
+ captured = []
+ for entry in entries:
+ notes = {}
+ for yaml_key, notes_key in fields:
+ if yaml_key in entry:
+ notes[notes_key] = entry.pop(yaml_key)
+ captured.append(notes)
+ return captured
+
+
+def read_yaml_model(path: str | Path) -> cobra.Model:
+ """Read a RAVEN/cobrapy YAML model into a ``cobra.Model``."""
+ with _open_text(path, "r") as handle:
+ raw = _to_plain(_cobra_yaml.load(handle))
+
+ if not isinstance(raw, dict):
+ raise ValueError(f"{path}: top-level YAML is a {type(raw).__name__}, not a mapping.")
+
+ metadata = raw.pop("metaData", None) or {}
+ version = raw.pop("version", None)
+ foreign = {k: raw.pop(k) for k in list(raw) if k not in _COBRA_TOP_KEYS}
+
+ met_notes = _capture_entry_fields(raw.get("metabolites", []), _MET_FIELDS)
+ rxn_notes = _capture_entry_fields(raw.get("reactions", []), _RXN_FIELDS)
+ gene_notes = _capture_entry_fields(raw.get("genes", []), _GENE_FIELDS)
+
+ model = model_from_dict(raw)
+
+ for met, notes in zip(model.metabolites, met_notes, strict=False):
+ met.notes = notes
+ for rxn, notes in zip(model.reactions, rxn_notes, strict=False):
+ rxn.notes = notes
+ for gene, notes in zip(model.genes, gene_notes, strict=False):
+ gene.notes = notes
+
+ # Legacy files keep id/name inside metaData; restore them if cobra found none.
+ if metadata.get("id") and not model.id:
+ model.id = metadata["id"]
+ if metadata.get("name") and not model.name:
+ model.name = metadata["name"]
+ if metadata:
+ model.notes["metaData"] = metadata
+ if version is not None:
+ model.notes["version"] = version
+ if foreign:
+ model.notes["_yaml_sections"] = foreign
+
+ return model
+
+
+def _emit_entry_fields(entries, fields):
+ """Lift RAVEN-only keys out of each entry's ``notes`` dict to top level."""
+ for entry in entries:
+ notes = entry.pop("notes", None)
+ if not isinstance(notes, dict):
+ continue
+ notes = dict(notes)
+ for yaml_key, notes_key in fields:
+ if notes_key in notes:
+ entry[yaml_key] = notes.pop(notes_key)
+ # Preserve any remaining (non-RAVEN) notes. The RAVEN free-text note is lifted
+ # to the YAML key "notes"; if leftovers also exist, merge them with it under
+ # that key (rather than silently dropping the leftovers).
+ if notes:
+ if "notes" in entry:
+ notes["note"] = entry["notes"]
+ entry["notes"] = notes
+
+
+def write_yaml_model(
+ model: cobra.Model, path: str | Path, *, sort_ids: bool = False
+) -> None:
+ """Write a ``cobra.Model`` to RAVEN/cobrapy (``!!omap``) YAML.
+
+ With ``sort_ids=True`` metabolites/reactions/genes/compartments are written
+ in alphabetical order (diff-friendly), without modifying ``model``.
+ """
+ model_notes = dict(model.notes or {})
+ stored_meta = model_notes.pop("metaData", None) or {}
+ version = model_notes.pop("version", None)
+ foreign = model_notes.pop("_yaml_sections", None) or {}
+
+ doc = OrderedDict(_to_plain(model_to_dict(model)))
+
+ if sort_ids:
+ for section in ("metabolites", "reactions", "genes"):
+ if section in doc:
+ doc[section] = sorted(doc[section], key=lambda e: e.get("id", ""))
+ if isinstance(doc.get("compartments"), dict):
+ doc["compartments"] = dict(sorted(doc["compartments"].items()))
+
+ _emit_entry_fields(doc.get("metabolites", []), _MET_FIELDS)
+ _emit_entry_fields(doc.get("reactions", []), _RXN_FIELDS)
+ _emit_entry_fields(doc.get("genes", []), _GENE_FIELDS)
+
+ # cobra dict order is metabolites, reactions, genes, id, name, compartments;
+ # append version / gecko_light / metaData / ec-* like RAVEN's writer.
+ if version is not None:
+ doc["version"] = version
+ metadata = dict(stored_meta)
+ if model.id:
+ metadata.setdefault("id", model.id)
+ if model.name:
+ metadata.setdefault("name", model.name)
+ for key in ("gecko_light",):
+ if key in foreign:
+ doc[key] = foreign.pop(key)
+ if metadata:
+ doc["metaData"] = metadata
+ for key, value in foreign.items():
+ doc[key] = value
+
+ with _open_text(path, "w") as handle:
+ _cobra_yaml.dump(doc, handle)
diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py
new file mode 100644
index 0000000..074c36f
--- /dev/null
+++ b/src/raven_python/manipulation/__init__.py
@@ -0,0 +1,36 @@
+"""Generic cobra.Model structural transforms that cobrapy does not cover cleanly:
+reaction building from equations, batch GPR / bound changes, irreversibility splitting,
+isozyme expansion, compartment merge / copy, and model merging by name."""
+from .add import add_reactions_from_equations
+from .change import change_gene_reaction_rules, change_reaction_equations
+from .expand import expand_model
+from .irreversible import convert_to_irreversible
+from .merge import merge_models
+from .parameters import set_variance_bounds
+from .remove import remove_genes, remove_metabolites
+from .simplify import (
+ constrain_reversible_reactions,
+ group_linear_reactions,
+ remove_dead_end_reactions,
+ remove_duplicate_reactions,
+)
+from .transfer import add_reactions_from_model
+from .transport import add_transport_reactions
+
+__all__ = [
+ "add_reactions_from_equations",
+ "add_reactions_from_model",
+ "add_transport_reactions",
+ "change_gene_reaction_rules",
+ "change_reaction_equations",
+ "constrain_reversible_reactions",
+ "convert_to_irreversible",
+ "expand_model",
+ "group_linear_reactions",
+ "merge_models",
+ "remove_dead_end_reactions",
+ "remove_duplicate_reactions",
+ "remove_genes",
+ "remove_metabolites",
+ "set_variance_bounds",
+]
diff --git a/src/raven_python/manipulation/add.py b/src/raven_python/manipulation/add.py
new file mode 100644
index 0000000..3842297
--- /dev/null
+++ b/src/raven_python/manipulation/add.py
@@ -0,0 +1,345 @@
+"""Add reactions to a model from equation strings.
+
+Most of the equivalent MATLAB code is struct-of-arrays bookkeeping (padding parallel
+``rxnNames`` / ``lb`` / ``ub`` / ``grRules`` / ... fields) that does not exist in
+cobra, where each ``Reaction`` carries its own attributes. cobra also already
+covers a large part of the *behaviour*:
+
+* ``Reaction.build_reaction_from_string`` parses equation strings, coefficients,
+ and reversibility arrows (``<=>``, ``-->``, ``=>``) and creates unknown
+ metabolites — but only matching metabolites **by ID**, and it leaves new
+ metabolites with ``compartment=None``.
+* assigning ``reaction.gene_reaction_rule`` auto-creates ``Gene`` objects.
+
+So this port keeps only the parts cobra lacks:
+
+* **name-based matching** — interpret equation tokens as metabolite *names*
+ (RAVEN eqnType 2) or as ``name[comp]`` (eqnType 3), not just IDs;
+* **correct compartment** assignment for newly created metabolites;
+* **strict policies** — optionally *error* (rather than silently create) on
+ unknown metabolites or genes, and always error on a duplicate reaction ID
+ (cobra silently ignores those).
+
+Instead of RAVEN's ``eqnType`` integer (1/2/3) the matching mode is a readable
+keyword: ``mets_by="id"`` or ``mets_by="name"``, with ``name[comp]`` recognised
+automatically. See IMPROVEMENTS.md (A-series) for the rationale.
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from collections import OrderedDict
+from collections.abc import Mapping, Sequence
+
+import cobra
+from cobra import Metabolite, Reaction
+from cobra.core.gene import GPR
+
+from raven_python.utils.parse import parse_name_comp
+
+# Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it).
+_REVERSIBLE_ARROWS = ("<=>",)
+_FORWARD_ARROWS = ("-->", "->", "=>")
+
+
+def _split_equation(equation: str) -> tuple[str, str, bool]:
+ """Split an equation into (lhs, rhs, reversible) on its arrow."""
+ for arrow in _REVERSIBLE_ARROWS:
+ if arrow in equation:
+ lhs, rhs = equation.split(arrow, 1)
+ return lhs, rhs, True
+ for arrow in _FORWARD_ARROWS:
+ if arrow in equation:
+ lhs, rhs = equation.split(arrow, 1)
+ return lhs, rhs, False
+ raise ValueError(f"No reaction arrow (<=>, -->, =>) found in equation: {equation!r}")
+
+
+def _parse_side(side: str) -> list[tuple[float, str, str | None]]:
+ """Parse one side of an equation into ``[(coefficient, token, fallback), ...]``.
+
+ The ``fallback`` slot is for the ambiguous ``" "`` shape: when
+ matching by name, ``"2 oxoglutarate"`` could be either ``coeff=2, name="oxoglutarate"``
+ or ``coeff=1, name="2 oxoglutarate"`` (a real chemistry name). We return the
+ coefficient-split form as the primary and the full term as the fallback; the
+ resolver picks whichever matches an existing metabolite. Pure-number heads
+ with no name (``"2"``) and pure-name terms (``"glucose"``) have no fallback.
+ """
+ terms: list[tuple[float, str, str | None]] = []
+ for raw in side.split(" + "):
+ term = raw.strip()
+ if not term:
+ continue
+ head, _, tail = term.partition(" ")
+ try:
+ coeff = float(head)
+ token = tail.strip()
+ except ValueError:
+ coeff, token = 1.0, term
+ fallback = None
+ else:
+ # Coefficient-split succeeded. Keep the full term as a fallback when
+ # the tail is non-empty so name-resolution can re-try it as one token.
+ fallback = term if token else None
+ if not token:
+ raise ValueError(f"Missing metabolite after coefficient in term: {raw!r}")
+ terms.append((coeff, token, fallback))
+ return terms
+
+
+def _new_met_id(model: cobra.Model, prefix: str) -> str:
+ """Next free ```` metabolite ID (RAVEN m1, m2, ... scheme)."""
+ pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+ used = [int(m.group(1)) for met in model.metabolites if (m := pattern.match(met.id))]
+ n = max(used) + 1 if used else 1
+ while f"{prefix}{n}" in model.metabolites:
+ n += 1
+ return f"{prefix}{n}"
+
+
+def _try_existing(
+ model: cobra.Model, token: str, *, mets_by: str, compartment: str | None
+) -> Metabolite | None:
+ """Look up ``token`` as an existing metabolite (no creation, no side effects).
+
+ Returns the matching metabolite or ``None``. Used by ``_stoichiometry`` to
+ disambiguate the ``" "`` shape: if a metabolite whose *name*
+ (or id) literally contains a leading number exists, prefer it over splitting
+ the number off as a coefficient.
+ """
+ name, comp = parse_name_comp(token)
+ if mets_by == "id" and comp is None:
+ return model.metabolites.get_by_id(token) if token in model.metabolites else None
+ target_comp = comp if comp is not None else compartment
+ if target_comp is None:
+ return None
+ for met in model.metabolites:
+ if met.name == name and met.compartment == target_comp:
+ return met
+ return None
+
+
+def _resolve_metabolite(
+ model: cobra.Model,
+ token: str,
+ *,
+ mets_by: str,
+ compartment: str | None,
+ allow_new_mets: bool,
+ new_met_prefix: str,
+) -> Metabolite:
+ """Resolve an equation token to an existing or newly created Metabolite."""
+ name, comp = parse_name_comp(token)
+
+ if mets_by == "id" and comp is None:
+ # token is a metabolite ID
+ if token in model.metabolites:
+ return model.metabolites.get_by_id(token)
+ if not allow_new_mets:
+ raise ValueError(
+ f"Unknown metabolite ID {token!r}; pass allow_new_mets=True to create it."
+ )
+ if compartment is None:
+ raise ValueError(
+ f"Cannot create metabolite {token!r}: no compartment given."
+ )
+ _warn_unknown_compartment(model, compartment, token)
+ met = Metabolite(token, compartment=compartment)
+ model.add_metabolites([met])
+ return met
+
+ # name-based (mets_by="name") or explicit name[comp]
+ target_comp = comp if comp is not None else compartment
+ if target_comp is None:
+ raise ValueError(
+ f"Metabolite {token!r} matched by name needs a compartment; "
+ "pass compartment=... or use the name[comp] syntax."
+ )
+ if comp is not None and target_comp not in model.compartments and not allow_new_mets:
+ raise ValueError(f"Compartment {target_comp!r} is not in the model.")
+
+ matches = [
+ met
+ for met in model.metabolites
+ if met.name == name and met.compartment == target_comp
+ ]
+ if matches:
+ return matches[0]
+ if not allow_new_mets:
+ raise ValueError(
+ f"No metabolite named {name!r} in compartment {target_comp!r}; "
+ "pass allow_new_mets=True to create it."
+ )
+ _warn_unknown_compartment(model, target_comp, name)
+ met = Metabolite(_new_met_id(model, new_met_prefix), name=name, compartment=target_comp)
+ model.add_metabolites([met])
+ return met
+
+
+def _warn_unknown_compartment(model: cobra.Model, compartment: str, identifier: str) -> None:
+ """Warn when a new metabolite would be born into a not-yet-registered compartment.
+
+ Both ``mets_by`` paths previously created the metabolite without validating
+ the compartment, so a typo (``"cyto"`` for ``"c"``) silently produced a
+ one-metabolite ghost compartment. cobra inherits the compartment from the
+ first metabolite assigned to it, so the fix is a warning, not a hard error.
+ """
+ known = set(model.compartments) | set(model._compartments)
+ if compartment not in known:
+ warnings.warn(
+ f"Creating metabolite {identifier!r} in unregistered compartment "
+ f"{compartment!r} (existing: {sorted(known) or 'none'}); "
+ "add the compartment first or check for a typo.",
+ stacklevel=5,
+ )
+
+
+def _stoichiometry(
+ model: cobra.Model,
+ equation: str,
+ *,
+ mets_by: str,
+ compartment: str | None,
+ allow_new_mets: bool,
+ new_met_prefix: str,
+) -> tuple[dict[Metabolite, float], bool]:
+ """Parse an equation into a {Metabolite: net coefficient} dict + reversibility."""
+ lhs, rhs, reversible = _split_equation(equation)
+ coeffs: OrderedDict[Metabolite, float] = OrderedDict()
+ had_terms = False
+ for sign, side in ((-1.0, lhs), (1.0, rhs)):
+ for coeff, token, fallback in _parse_side(side):
+ had_terms = True
+ # " " is ambiguous when the name itself starts with a
+ # number (e.g. "2 oxoglutarate"). Prefer the full-term interpretation
+ # when it matches an existing metabolite — otherwise fall through to
+ # the coefficient-split form.
+ met = None
+ if fallback is not None:
+ met = _try_existing(
+ model, fallback, mets_by=mets_by, compartment=compartment
+ )
+ if met is not None:
+ coeff = 1.0
+ if met is None:
+ met = _resolve_metabolite(
+ model,
+ token,
+ mets_by=mets_by,
+ compartment=compartment,
+ allow_new_mets=allow_new_mets,
+ new_met_prefix=new_met_prefix,
+ )
+ coeffs[met] = coeffs.get(met, 0.0) + sign * coeff
+ # Drop metabolites that net to zero (present as both substrate and product).
+ coeffs = OrderedDict((met, c) for met, c in coeffs.items() if c != 0.0)
+ if had_terms and not coeffs:
+ warnings.warn(
+ f"Equation {equation!r} has no net metabolites (all terms cancelled); "
+ "the reaction will be added with empty stoichiometry.",
+ stacklevel=4,
+ )
+ return dict(coeffs), reversible
+
+
+def add_reactions_from_equations(
+ model: cobra.Model,
+ reactions: Sequence[Mapping],
+ *,
+ mets_by: str = "id",
+ compartment: str | None = None,
+ allow_new_mets: bool = True,
+ allow_new_genes: bool = True,
+ new_met_prefix: str = "m",
+) -> list[Reaction]:
+ """Add reactions defined by equation strings, matching mets by ID or name.
+ Parameters
+ ----------
+ model
+ Target ``cobra.Model``, mutated in place.
+ reactions
+ Sequence of mappings, one per reaction. Recognised keys:
+
+ * ``id`` (**required**) — reaction ID; must not already exist.
+ * ``equation`` (**required**) — e.g. ``"atp_c + h2o_c <=> adp_c + pi_c"``.
+ Use ``<=>`` for reversible, ``-->``/``->``/``=>`` for irreversible.
+ * ``name`` — reaction name.
+ * ``bounds`` — ``(lower, upper)`` tuple; overrides the arrow.
+ * ``gene_reaction_rule`` — GPR string.
+ * ``subsystem`` — subsystem name.
+ mets_by
+ How bare equation tokens (without ``[comp]``) are matched:
+ ``"id"`` (RAVEN eqnType 1) or ``"name"`` (eqnType 2). A ``name[comp]``
+ token (eqnType 3) is always matched by name + compartment.
+ compartment
+ Default compartment for new metabolites and for name-matched tokens
+ without an explicit ``[comp]``.
+ allow_new_mets
+ If True (default), create metabolites not found. New metabolites get
+ ``compartment`` (id mode) or an auto ID ``m1``, ``m2``, ... (name mode).
+ If False, an unknown metabolite raises.
+ allow_new_genes
+ If True (default), genes in a GPR are auto-created by cobra. If False,
+ a GPR referencing a gene not already in the model raises.
+ new_met_prefix
+ Prefix for auto-generated metabolite IDs in name mode (default ``"m"``).
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions added, in input order.
+ """
+ if mets_by not in ("id", "name"):
+ raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")
+
+ known_genes = {gene.id for gene in model.genes}
+ added: list[Reaction] = []
+
+ for spec in reactions:
+ if "id" not in spec:
+ raise ValueError(f"Reaction spec missing required 'id': {spec!r}")
+ rxn_id = spec["id"]
+ if rxn_id in model.reactions:
+ raise ValueError(
+ f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first."
+ )
+ if "equation" not in spec:
+ raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.")
+
+ coeffs, reversible = _stoichiometry(
+ model,
+ spec["equation"],
+ mets_by=mets_by,
+ compartment=compartment,
+ allow_new_mets=allow_new_mets,
+ new_met_prefix=new_met_prefix,
+ )
+
+ rxn = Reaction(rxn_id, name=spec.get("name", ""))
+ if "bounds" in spec:
+ rxn.bounds = tuple(spec["bounds"])
+ else:
+ config = cobra.Configuration()
+ lower = config.lower_bound if reversible else 0.0
+ rxn.bounds = (lower, config.upper_bound)
+ if "subsystem" in spec:
+ rxn.subsystem = spec["subsystem"]
+
+ model.add_reactions([rxn])
+ rxn.add_metabolites(coeffs)
+
+ rule = spec.get("gene_reaction_rule", "")
+ if rule:
+ if not allow_new_genes:
+ missing = sorted(set(GPR.from_string(rule).genes) - known_genes)
+ if missing:
+ raise ValueError(
+ f"Reaction {rxn_id!r} references genes not in the model: "
+ f"{missing}. Set allow_new_genes=True or add them first."
+ )
+ rxn.gene_reaction_rule = rule
+ known_genes.update(gene.id for gene in rxn.genes)
+
+ added.append(rxn)
+
+ return added
diff --git a/src/raven_python/manipulation/change.py b/src/raven_python/manipulation/change.py
new file mode 100644
index 0000000..78612ba
--- /dev/null
+++ b/src/raven_python/manipulation/change.py
@@ -0,0 +1,125 @@
+"""Change the stoichiometry of existing reactions from equation strings.
+
+Editing the same ``Reaction`` object changes only its stoichiometry — its id, name,
+bounds, GPR, subsystem, and position are preserved automatically by cobra.
+
+So this port simply re-parses the equation (reusing the same metabolite
+matching as :func:`~raven_python.manipulation.add.add_reactions_from_equations`,
+including name and ``name[comp]`` modes that cobra lacks) and swaps the
+metabolites in place.
+
+Like RAVEN, **bounds are left unchanged** even if the new equation's arrow
+implies a different reversibility — use a bounds setter for that.
+"""
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+import cobra
+from cobra import Reaction
+
+from raven_python.manipulation.add import _stoichiometry
+
+__all__ = ["change_reaction_equations", "change_gene_reaction_rules"]
+
+
+def change_reaction_equations(
+ model: cobra.Model,
+ equations: Mapping[str, str],
+ *,
+ mets_by: str = "id",
+ compartment: str | None = None,
+ allow_new_mets: bool = True,
+ new_met_prefix: str = "m",
+) -> list[Reaction]:
+ """Replace the stoichiometry of existing reactions.
+ Parameters
+ ----------
+ model
+ Target ``cobra.Model``, mutated in place.
+ equations
+ Mapping of ``reaction_id -> equation string``. Every ID must already
+ exist in the model. Equation syntax is identical to
+ :func:`~raven_python.manipulation.add.add_reactions_from_equations`.
+ mets_by, compartment, allow_new_mets, new_met_prefix
+ Metabolite-matching options, as in ``add_reactions_from_equations``.
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions changed, in input order.
+
+ Notes
+ -----
+ Bounds are **not** modified, matching RAVEN. Changing an equation from
+ ``-->`` to ``<=>`` does not by itself make the reaction reversible; adjust
+ the bounds separately.
+ """
+ if mets_by not in ("id", "name"):
+ raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")
+
+ changed: list[Reaction] = []
+ for rxn_id, equation in equations.items():
+ if rxn_id not in model.reactions:
+ raise ValueError(f"Reaction {rxn_id!r} not found in the model.")
+ rxn = model.reactions.get_by_id(rxn_id)
+
+ coeffs, _reversible = _stoichiometry(
+ model,
+ equation,
+ mets_by=mets_by,
+ compartment=compartment,
+ allow_new_mets=allow_new_mets,
+ new_met_prefix=new_met_prefix,
+ )
+
+ rxn.subtract_metabolites(dict(rxn.metabolites), combine=True)
+ rxn.add_metabolites(coeffs)
+ changed.append(rxn)
+
+ return changed
+
+
+def change_gene_reaction_rules(
+ model: cobra.Model,
+ rules: Mapping[str, str],
+ *,
+ replace: bool = True,
+) -> list[Reaction]:
+ """Set or append gene-reaction rules on existing reactions.
+ cobra already does the heavy lifting on assignment to
+ ``reaction.gene_reaction_rule``: it auto-creates any new ``Gene`` objects and
+ normalises the rule. So the value here is batching plus RAVEN's ``replace``
+ option to **append** rather than overwrite.
+
+ Parameters
+ ----------
+ model
+ Target ``cobra.Model``, mutated in place.
+ rules
+ Mapping of ``reaction_id -> GPR string``. Every ID must already exist.
+ replace
+ If True (default), overwrite the existing GPR. If False, append the new
+ rule as an isozyme: ``(old) or (new)`` (just ``new`` if the reaction had
+ no GPR).
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions changed, in input order.
+ """
+ changed: list[Reaction] = []
+ for rxn_id, rule in rules.items():
+ if rxn_id not in model.reactions:
+ raise ValueError(f"Reaction {rxn_id!r} not found in the model.")
+ rxn = model.reactions.get_by_id(rxn_id)
+
+ if replace or not rxn.gene_reaction_rule:
+ new_rule = rule
+ else:
+ new_rule = f"({rxn.gene_reaction_rule}) or ({rule})"
+
+ rxn.gene_reaction_rule = new_rule # cobra creates genes + normalises
+ changed.append(rxn)
+
+ return changed
diff --git a/src/raven_python/manipulation/compartments.py b/src/raven_python/manipulation/compartments.py
new file mode 100644
index 0000000..091d196
--- /dev/null
+++ b/src/raven_python/manipulation/compartments.py
@@ -0,0 +1,196 @@
+"""Compartment manipulation — merge all compartments into one, or copy reactions to a
+new compartment (ports of RAVEN's ``mergeCompartments`` and ``copyToComps``).
+
+Both functions are useful **independently of** :func:`raven_python.localization.predict_localization`:
+``merge_compartments`` flattens a multi-compartment model for a simplified analysis
+(e.g. checking whether the network can in principle make a metabolite, with no
+compartment topology in the way); ``copy_to_compartment`` is a building block for
+constructing dual-localised pathways. cobra has no equivalents.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+
+# Compartments produced by merge_compartments (RAVEN uses 's' for "system").
+_MERGED_COMPARTMENT = "s"
+
+
+def merge_compartments(
+ model: cobra.Model,
+ *,
+ merged_id: str = _MERGED_COMPARTMENT,
+ merged_name: str = "system",
+ drop_single_metabolite_reactions: bool = True,
+ deduplicate_reactions: bool = True,
+) -> tuple[cobra.Model, list[str], list[str]]:
+ """Merge every metabolite of ``model`` into one ``merged_id`` compartment.
+
+ Returns ``(model_copy, deleted_single_met_reactions, deduplicated_reactions)``. The
+ returned model is a deep copy of the input. Use cases:
+
+ * Check whether the network can produce/consume a metabolite at all (compartment
+ topology is often what makes a model look blocked).
+ * Simplify a model for visualisation or an analysis that doesn't care about
+ compartments.
+ * As a pre-step for localisation when the user does want RAVEN's
+ "start from scratch" workflow (call :func:`merge_compartments` then
+ :func:`raven_python.localization.predict_localization` with the full reaction list).
+
+ Metabolites that already share a base id (e.g. ``glc__D_c`` and ``glc__D_e`` both
+ map to ``glc__D``) collapse into one entity in the merged compartment; their
+ stoichiometric contributions are summed per reaction. Reactions that end up with
+ only one metabolite (e.g. ``A[c] → A[m]`` becomes ``A → A`` = nothing) are deleted
+ by default (RAVEN's ``deleteRxnsWithOneMet``). Reactions that become identical
+ after merging are deduplicated (one survives).
+ """
+ out = model.copy()
+
+ # 1. For each metabolite, derive a base id (strip the trailing _).
+ # Two mets in different compartments sharing the base id collapse to one.
+ new_to_old: dict[str, list[cobra.Metabolite]] = {}
+ for m in list(out.metabolites):
+ base = _base_id(m)
+ new_to_old.setdefault(base, []).append(m)
+
+ # 2. Build the merged metabolites and rewrite reactions.
+ canonical: dict[str, cobra.Metabolite] = {}
+ for base, mets in new_to_old.items():
+ proto = mets[0]
+ new_met = cobra.Metabolite(base, name=proto.name, compartment=merged_id,
+ formula=proto.formula, charge=proto.charge)
+ new_met.notes = dict(proto.notes or {})
+ canonical[base] = new_met
+
+ # Rewrite all reactions: replace each metabolite with its canonical, summing
+ # coefficients where multiple original mets collapse to one.
+ rewritten: dict[str, dict[str, float]] = {}
+ for r in list(out.reactions):
+ new_stoich: dict[cobra.Metabolite, float] = {}
+ for m, coeff in list(r.metabolites.items()):
+ canon = canonical[_base_id(m)]
+ new_stoich[canon] = new_stoich.get(canon, 0.0) + coeff
+ # Drop zero net coefficients (substrate + product of the same base met cancel).
+ new_stoich = {m: c for m, c in new_stoich.items() if c != 0.0}
+ rewritten[r.id] = {m.id: c for m, c in new_stoich.items()}
+
+ # Now build a fresh model with the canonical mets + rewritten reactions; the
+ # cobra in-place rewrite would require careful constraint surgery, so a clean
+ # rebuild is simpler and less error-prone.
+ merged = cobra.Model(out.id or "merged")
+ merged.compartments = {merged_id: merged_name}
+ merged.add_metabolites(list(canonical.values()))
+ deleted_single: list[str] = []
+ deduplicated: list[str] = []
+ seen_signatures: dict[tuple, str] = {}
+ keep_reactions: list[cobra.Reaction] = []
+ for r in out.reactions:
+ stoich = rewritten[r.id]
+ if drop_single_metabolite_reactions and len(stoich) <= 1:
+ deleted_single.append(r.id)
+ continue
+ if not stoich: # everything cancelled
+ deleted_single.append(r.id)
+ continue
+ sig = (frozenset(stoich.items()), bool(r.lower_bound < 0), bool(r.upper_bound > 0))
+ if deduplicate_reactions and sig in seen_signatures:
+ deduplicated.append(r.id)
+ continue
+ seen_signatures[sig] = r.id
+ new_r = cobra.Reaction(r.id, name=r.name, lower_bound=r.lower_bound,
+ upper_bound=r.upper_bound)
+ new_r.add_metabolites({merged.metabolites.get_by_id(mid): c for mid, c in stoich.items()})
+ new_r.gene_reaction_rule = r.gene_reaction_rule
+ if r.subsystem:
+ new_r.subsystem = r.subsystem
+ new_r.notes = dict(r.notes or {})
+ keep_reactions.append(new_r)
+ merged.add_reactions(keep_reactions)
+ return merged, deleted_single, deduplicated
+
+
+def copy_to_compartment(
+ model: cobra.Model,
+ reactions: Iterable[str],
+ target_compartment: str,
+ *,
+ target_compartment_name: str | None = None,
+ delete_original: bool = False,
+ id_suffix: str | None = None,
+) -> tuple[cobra.Model, list[str], list[str]]:
+ """Copy a set of reactions into ``target_compartment``. RAVEN's ``copyToComps``.
+
+ Returns ``(model_copy, new_reaction_ids, new_metabolite_ids)``. Use cases:
+
+ * Build a dual-localised pathway (e.g. duplicate glycolysis into a peroxisome).
+ * Mirror a curated subsystem into an additional compartment as a draft to refine.
+ * Set up the input for a flux comparison between alternate compartmentalisations.
+
+ Each copied reaction is given the id ``"_"`` (default
+ ``id_suffix=target_compartment``); each metabolite it touches is mapped to (or
+ created in) ``target_compartment`` with the same suffix convention. ``delete_original=True``
+ moves the reactions instead of copying.
+ """
+ out = model.copy()
+ suffix = id_suffix if id_suffix is not None else target_compartment
+ if target_compartment not in out.compartments:
+ out.compartments = {**out.compartments,
+ target_compartment: target_compartment_name or target_compartment}
+
+ preexisting_met_ids = {x.id for x in out.metabolites}
+ new_rxn_ids: list[str] = []
+ for rid in list(reactions):
+ if rid not in out.reactions:
+ raise ValueError(f"reaction {rid!r} not in model")
+ src = out.reactions.get_by_id(rid)
+ new_id = f"{rid}_{suffix}"
+ if new_id in out.reactions:
+ continue # already copied; idempotent
+ new_stoich: dict[cobra.Metabolite, float] = {}
+ for m, coeff in src.metabolites.items():
+ target_met = _met_in_compartment(out, m, target_compartment, suffix=suffix)
+ new_stoich[target_met] = coeff
+ new_r = cobra.Reaction(new_id, name=src.name,
+ lower_bound=src.lower_bound, upper_bound=src.upper_bound)
+ new_r.add_metabolites(new_stoich)
+ new_r.gene_reaction_rule = src.gene_reaction_rule
+ if src.subsystem:
+ new_r.subsystem = src.subsystem
+ new_r.notes = dict(src.notes or {})
+ out.add_reactions([new_r])
+ new_rxn_ids.append(new_id)
+ if delete_original:
+ out.remove_reactions([src.id], remove_orphans=False)
+
+ new_met_ids = [m.id for m in out.metabolites if m.id not in preexisting_met_ids]
+ return out, new_rxn_ids, new_met_ids
+
+
+# ----------------------------------------------------------------- helpers
+
+def _base_id(m: cobra.Metabolite) -> str:
+ """Strip the trailing ``_`` suffix from a metabolite id (if present)."""
+ if m.compartment and m.id.endswith(f"_{m.compartment}"):
+ return m.id[: -(len(m.compartment) + 1)]
+ return m.id
+
+
+def _met_in_compartment(model: cobra.Model, source: cobra.Metabolite,
+ compartment: str, *, suffix: str | None = None) -> cobra.Metabolite:
+ """Return (creating if needed) the copy of ``source`` in ``compartment``.
+
+ The new metabolite id is ``"_"`` (default ``suffix=compartment``).
+ Already-existing copies are reused.
+ """
+ if source.compartment == compartment:
+ return source
+ base = _base_id(source)
+ new_id = f"{base}_{suffix if suffix is not None else compartment}"
+ if new_id in model.metabolites:
+ return model.metabolites.get_by_id(new_id)
+ new_met = cobra.Metabolite(new_id, name=source.name, compartment=compartment,
+ formula=source.formula, charge=source.charge)
+ new_met.notes = dict(source.notes or {})
+ model.add_metabolites([new_met])
+ return new_met
diff --git a/src/raven_python/manipulation/expand.py b/src/raven_python/manipulation/expand.py
new file mode 100644
index 0000000..246f3b9
--- /dev/null
+++ b/src/raven_python/manipulation/expand.py
@@ -0,0 +1,124 @@
+"""Expand reactions with isozymes into one reaction per isozyme.
+
+Operates on cobra's GPR AST, so the model stays a plain ``cobra.Model`` throughout.
+
+Provenance: this implementation was first written for geckopy
+(``geckopy/ec_model/pipeline/expand.py``, where it backed makeEcModel stage 5)
+and is adopted here as its canonical home; geckopy will import it from raven_python
+once raven_python is published.
+
+MATLAB-COMPAT: GECKO MATLAB and RAVEN ``expandModel.m`` use string manipulation
+on grRules to detect and split isozymes. raven_python uses cobrapy's GPR AST
+instead. Output should be equivalent for any well-formed GPR; cases that differ
+are likely malformed GPR strings that the AST flags as invalid.
+"""
+from __future__ import annotations
+
+import ast
+import copy
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _gpr_to_dnf(gpr: GPR) -> list[list[str]]:
+ """Convert a GPR to disjunctive normal form (list of AND-clauses).
+
+ An empty GPR yields an empty list. A single clause (no OR anywhere)
+ yields a list of length 1. OR-of-ANDs yields one sublist per
+ disjunct, each containing the gene names ANDed together.
+
+ Handles distributivity: ``g1 and (g2 or g3)`` becomes
+ ``[[g1, g2], [g1, g3]]``.
+ """
+ if gpr is None or gpr.body is None:
+ return []
+ return _node_to_dnf(gpr.body)
+
+
+def _node_to_dnf(node) -> list[list[str]]:
+ """Recursive helper. Returns DNF as list of AND-clauses."""
+ if isinstance(node, ast.Name):
+ return [[node.id]]
+ if isinstance(node, ast.BoolOp):
+ if isinstance(node.op, ast.Or):
+ result: list[list[str]] = []
+ for child in node.values:
+ result.extend(_node_to_dnf(child))
+ return result
+ if isinstance(node.op, ast.And):
+ clauses: list[list[str]] = [[]]
+ for child in node.values:
+ child_dnf = _node_to_dnf(child)
+ new_clauses: list[list[str]] = []
+ for existing in clauses:
+ for extra in child_dnf:
+ new_clauses.append(existing + extra)
+ clauses = new_clauses
+ return clauses
+ raise ValueError(f"Unexpected GPR node type: {type(node).__name__}")
+
+
+def expand_model(model: cobra.Model) -> list[str]:
+ """Split reactions with isozymes (OR in GPR) into one reaction per isozyme.
+ For each reaction whose GPR contains at least one OR, the reaction
+ is removed and replaced by one copy per disjunctive clause. The new
+ reactions get ID suffix ``_EXP_1``, ``_EXP_2``, etc. All other
+ fields (stoichiometry, bounds, name, subsystem) are copied verbatim;
+ only the GPR is simplified to the single AND-clause for that
+ isozyme.
+
+ Reactions with no GPR, or with a GPR that has no OR, are left
+ untouched.
+
+ Parameters
+ ----------
+ model
+ A cobra.Model, mutated in place.
+
+ Returns
+ -------
+ list of str
+ Sorted IDs of newly added expanded reactions (those with
+ ``_EXP_N`` suffixes). The original reactions that were split
+ are no longer in the model.
+ """
+ expansions: list[tuple[cobra.Reaction, list[list[str]]]] = []
+
+ for rxn in model.reactions:
+ if not rxn.gene_reaction_rule:
+ continue
+ clauses = _gpr_to_dnf(rxn.gpr)
+ if len(clauses) <= 1:
+ continue
+ expansions.append((rxn, clauses))
+
+ added_ids: list[str] = []
+ for original_rxn, clauses in expansions:
+ new_rxns: list[cobra.Reaction] = []
+ for i, clause in enumerate(clauses, start=1):
+ new_rxn = cobra.Reaction(
+ id=f"{original_rxn.id}_EXP_{i}",
+ name=original_rxn.name,
+ )
+ new_rxn.lower_bound = original_rxn.lower_bound
+ new_rxn.upper_bound = original_rxn.upper_bound
+ new_rxn.add_metabolites(dict(original_rxn.metabolites.items()))
+ new_rxn.subsystem = original_rxn.subsystem
+ new_rxn.gene_reaction_rule = " and ".join(clause)
+ # Propagate per-reaction metadata (notably ec-code / annotations)
+ # so downstream functions see the same annotations on expanded
+ # reactions as on the original. Deep-copy so siblings are independent.
+ new_rxn.annotation = copy.deepcopy(original_rxn.annotation)
+ new_rxn.notes = copy.deepcopy(original_rxn.notes)
+ new_rxns.append(new_rxn)
+
+ obj_coeff = original_rxn.objective_coefficient
+ model.remove_reactions([original_rxn])
+ model.add_reactions(new_rxns)
+ if obj_coeff: # keep the original in the objective — sum over its isozyme copies
+ for new_rxn in new_rxns:
+ new_rxn.objective_coefficient = obj_coeff
+ added_ids.extend(r.id for r in new_rxns)
+
+ return sorted(added_ids)
diff --git a/src/raven_python/manipulation/irreversible.py b/src/raven_python/manipulation/irreversible.py
new file mode 100644
index 0000000..3f64a68
--- /dev/null
+++ b/src/raven_python/manipulation/irreversible.py
@@ -0,0 +1,72 @@
+"""Convert reversible reactions to an irreversible (forward + reverse) form.
+
+cobrapy's own ``convert_to_irreversible`` was removed, so this is a genuine
+implementation rather than a wrapper.
+
+Provenance: first written for geckopy
+(``geckopy/ec_model/pipeline/preprocess.py``, makeEcModel stage 4, tagged
+"RAVENpy candidate") and adopted here as its canonical home; geckopy will
+import it from raven_python once raven_python is published.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def convert_to_irreversible(model: cobra.Model) -> list[str]:
+ """Split non-exchange reversible reactions into a forward + reverse pair.
+ For each non-exchange reaction with ``lb < 0``:
+
+ - The original reaction is kept as the forward direction. Its
+ lower bound is clamped to 0.
+ - A new reaction with the same ID plus a ``_REV`` suffix is added,
+ representing the reverse direction. Its stoichiometry is the
+ negation of the original, its bounds are ``(0, -original_lb)``,
+ and it inherits the name (with " (reversible)" appended) and the
+ gene-protein rule of the original.
+
+ Exchange reactions (boundary reactions) are never split, regardless
+ of their bounds, matching MATLAB behavior where exchange reactions
+ are explicitly excluded from ``convertToIrrev``.
+
+ Parameters
+ ----------
+ model
+ A cobra.Model, mutated in place.
+
+ Returns
+ -------
+ list of str
+ Sorted IDs of newly added reverse reactions (the ones ending in
+ ``_REV``). The forward reactions retain their original IDs.
+ """
+ reverse_rxns_to_add: list[cobra.Reaction] = []
+ forward_updates: list[cobra.Reaction] = []
+
+ for rxn in model.reactions:
+ if rxn.boundary:
+ continue
+ if rxn.lower_bound >= 0:
+ continue
+
+ original_lb = rxn.lower_bound
+
+ rev_rxn = cobra.Reaction(
+ id=f"{rxn.id}_REV",
+ name=(f"{rxn.name} (reversible)" if rxn.name else f"{rxn.id}_REV"),
+ )
+ rev_rxn.lower_bound = 0.0
+ rev_rxn.upper_bound = -original_lb
+ rev_rxn.add_metabolites({m: -c for m, c in rxn.metabolites.items()})
+ rev_rxn.gene_reaction_rule = rxn.gene_reaction_rule
+
+ reverse_rxns_to_add.append(rev_rxn)
+ forward_updates.append(rxn)
+
+ for rxn in forward_updates:
+ rxn.lower_bound = 0.0
+
+ if reverse_rxns_to_add:
+ model.add_reactions(reverse_rxns_to_add)
+
+ return sorted(r.id for r in reverse_rxns_to_add)
diff --git a/src/raven_python/manipulation/merge.py b/src/raven_python/manipulation/merge.py
new file mode 100644
index 0000000..bfa1f24
--- /dev/null
+++ b/src/raven_python/manipulation/merge.py
@@ -0,0 +1,146 @@
+"""Merge several models into one.
+
+cobra's ``Model.merge`` is pairwise and matches everything strictly by id; this
+merges **N** models and unifies metabolites by **name[compartment]** (so the same
+compound under different ids in two models becomes one), while adding **all**
+reactions without de-duplication
+(a reaction whose ID already exists is renamed ``id_``). Genes are
+unified by ID. Provenance (which source model each object came from) is recorded
+in ``notes['origin']``.
+
+The bulk of RAVEN's function is struct field-padding and manual S-matrix
+assembly, none of which is needed on ``cobra.Model``.
+"""
+from __future__ import annotations
+
+import copy
+import warnings
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Model, Reaction
+
+
+def _unique_id(existing, base: str, suffix: str) -> str:
+ """Return base, or base_suffix (then base_suffix_2, ...) if it collides."""
+ if base not in existing:
+ return base
+ candidate = f"{base}_{suffix}"
+ n = 2
+ while candidate in existing:
+ candidate = f"{base}_{suffix}_{n}"
+ n += 1
+ return candidate
+
+
+def merge_models(
+ models: Iterable[cobra.Model],
+ *,
+ match_by: str = "name",
+ track_origin: bool = True,
+) -> cobra.Model:
+ """Merge models into a single new model.
+ Parameters
+ ----------
+ models
+ The models to merge (two or more). A single model is returned as a copy.
+ match_by
+ How metabolites are unified across models: ``"name"`` (default) treats
+ metabolites with the same *name and compartment* as identical (IDs
+ ignored); ``"id"`` matches by metabolite ID.
+ track_origin
+ If True (default), record the source model's ``id`` in each reaction's,
+ metabolite's, and gene's ``notes['origin']``.
+
+ Returns
+ -------
+ cobra.Model
+ A new merged model (``id="MERGED"``). Reactions are **not** de-duplicated
+ — matching RAVEN, every reaction from every model is kept, with ID
+ collisions renamed ``id_``.
+ """
+ models = list(models)
+ if not models:
+ raise ValueError("merge_models requires at least one model.")
+ if match_by not in ("name", "id"):
+ raise ValueError(f"match_by must be 'name' or 'id', got {match_by!r}")
+ if len(models) == 1:
+ return models[0].copy()
+
+ merged = Model("MERGED")
+ comp_names: dict[str, str] = {}
+ met_lookup: dict = {} # name/comp or id key -> merged Metabolite
+
+ def met_key(met: Metabolite):
+ return (met.name, met.compartment) if match_by == "name" else met.id
+
+ def ensure_metabolite(src: Metabolite, origin: str) -> Metabolite:
+ key = met_key(src)
+ if key in met_lookup:
+ existing = met_lookup[key]
+ # Two source models can map to the same name[comp] (or id) with
+ # different formula/charge; silently picking the first-seen has
+ # quietly corrupted mass balance in the past. Warn so the caller
+ # sees the conflict.
+ if src.formula and existing.formula and src.formula != existing.formula:
+ warnings.warn(
+ f"merge_models: metabolite {existing.id!r} (from earlier model) "
+ f"and {src.id!r} (from {origin!r}) share key {key!r} but "
+ f"have different formulas ({existing.formula!r} vs {src.formula!r}); "
+ "keeping the first.",
+ stacklevel=3,
+ )
+ if (
+ existing.charge is not None
+ and src.charge is not None
+ and existing.charge != src.charge
+ ):
+ warnings.warn(
+ f"merge_models: metabolite {existing.id!r} (from earlier model) "
+ f"and {src.id!r} (from {origin!r}) share key {key!r} but "
+ f"have different charges ({existing.charge} vs {src.charge}); "
+ "keeping the first.",
+ stacklevel=3,
+ )
+ return existing
+ new_id = _unique_id(merged.metabolites, src.id, origin)
+ new_met = Metabolite(
+ new_id, name=src.name, compartment=src.compartment,
+ formula=src.formula, charge=src.charge,
+ )
+ new_met.annotation = copy.deepcopy(src.annotation)
+ new_met.notes = copy.deepcopy(src.notes)
+ if track_origin:
+ new_met.notes.setdefault("origin", origin)
+ merged.add_metabolites([new_met])
+ met_lookup[key] = new_met
+ return new_met
+
+ for model in models:
+ origin = model.id or "model"
+ comp_names.update(model.compartments)
+ genes_before = {g.id for g in merged.genes}
+
+ for rxn in model.reactions:
+ new_id = _unique_id(merged.reactions, rxn.id, origin)
+ new_rxn = Reaction(new_id, name=rxn.name)
+ new_rxn.bounds = rxn.bounds
+ new_rxn.subsystem = rxn.subsystem
+ merged.add_reactions([new_rxn])
+ new_rxn.add_metabolites(
+ {ensure_metabolite(m, origin): coef for m, coef in rxn.metabolites.items()}
+ )
+ if rxn.gene_reaction_rule:
+ new_rxn.gene_reaction_rule = rxn.gene_reaction_rule
+ new_rxn.annotation = copy.deepcopy(rxn.annotation)
+ new_rxn.notes = copy.deepcopy(rxn.notes)
+ if track_origin:
+ new_rxn.notes.setdefault("origin", origin)
+
+ if track_origin:
+ for gene in merged.genes:
+ if gene.id not in genes_before:
+ gene.notes.setdefault("origin", origin)
+
+ merged._compartments.update(comp_names)
+ return merged
diff --git a/src/raven_python/manipulation/parameters.py b/src/raven_python/manipulation/parameters.py
new file mode 100644
index 0000000..f349804
--- /dev/null
+++ b/src/raven_python/manipulation/parameters.py
@@ -0,0 +1,78 @@
+"""Set reaction bounds to a sign-aware ±% variance band around measured values.
+
+Cobra has no idiom for the *variance band* case (e.g. "5 ± 20 %"); the other common
+bound-setting cases are cobra one-liners:
+
+* fixed lb / ub → ``reaction.lower_bound`` / ``upper_bound`` / ``reaction.bounds``
+* equality → ``reaction.bounds = (v, v)``
+* objective → ``model.objective = {reaction: coeff}``
+* unconstrained → ``reaction.bounds = cobra.Configuration().bounds``
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Sequence
+
+import cobra
+from cobra import Reaction
+
+Number = int | float
+
+
+def _resolve(model: cobra.Model, reactions) -> list[Reaction]:
+ if isinstance(reactions, (str, Reaction)):
+ reactions = [reactions]
+ out: list[Reaction] = []
+ for r in reactions:
+ if isinstance(r, Reaction):
+ out.append(r)
+ elif r in model.reactions:
+ out.append(model.reactions.get_by_id(r))
+ else:
+ raise ValueError(f"Reaction {r!r} not found in the model.")
+ return out
+
+
+def _broadcast(value, n: int) -> list[float]:
+ if isinstance(value, (int, float)):
+ return [float(value)] * n
+ vals = [float(v) for v in value]
+ if len(vals) != n:
+ raise ValueError(
+ f"Expected 1 or {n} values to match the reactions, got {len(vals)}."
+ )
+ return vals
+
+
+def set_variance_bounds(
+ model: cobra.Model,
+ reactions: str | Reaction | Iterable,
+ values: Number | Sequence[Number],
+ percent: Number,
+) -> list[Reaction]:
+ """Constrain reactions to a ``±percent/2`` band around measured values.
+
+ For a measured value ``v`` and ``percent`` ``p``, the bounds become
+ ``v * (1 - p/200) .. v * (1 + p/200)`` — i.e. ``percent`` is the *total*
+ width, split half above and half below. For a negative ``v`` the two are
+ swapped so that ``lb <= ub``. E.g. ``percent=5`` gives 97.5 %..102.5 % of ``v``.
+
+ Parameters
+ ----------
+ reactions
+ Reaction IDs or objects.
+ values
+ Measured value per reaction; a scalar is broadcast to all reactions.
+ percent
+ Total band width as a percentage.
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions affected.
+ """
+ rxns = _resolve(model, reactions)
+ half = percent / 200.0
+ for rxn, v in zip(rxns, _broadcast(values, len(rxns)), strict=True):
+ lo, hi = v * (1 - half), v * (1 + half)
+ rxn.bounds = (hi, lo) if v < 0 else (lo, hi)
+ return rxns
diff --git a/src/raven_python/manipulation/remove.py b/src/raven_python/manipulation/remove.py
new file mode 100644
index 0000000..492de36
--- /dev/null
+++ b/src/raven_python/manipulation/remove.py
@@ -0,0 +1,120 @@
+"""Remove metabolites or genes from a model.
+
+For removing *reactions*, use cobra directly:
+``cobra.Model.remove_reactions(reactions, remove_orphans=...)``.
+
+The two functions here delegate the core to cobra and add the cobra-absent behaviour:
+
+* ``remove_metabolites`` — cobra matches metabolites by ID; RAVEN's ``isNames``
+ deletes a metabolite in **every compartment at once** by name. That name
+ resolution is the *sole* reason this wrapper exists (see the note on it).
+* ``remove_genes`` — cobra's ``cobra.manipulation.remove_genes`` already rewrites
+ GPRs through the boolean AST (removing one gene of ``A and B`` empties the
+ rule, of ``A or B`` keeps the other) — exactly RAVEN's intent, without its
+ ``eval``. The gap is RAVEN's default of **constraining** flux-blocked reactions
+ to zero instead of deleting them; exposed as ``blocked_reactions``.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+from cobra import Gene, Metabolite
+from cobra.manipulation import remove_genes as _cobra_remove_genes
+
+
+def _as_list(obj) -> list:
+ if isinstance(obj, (str, Metabolite, Gene)):
+ return [obj]
+ return list(obj)
+
+
+def remove_metabolites(
+ model: cobra.Model,
+ metabolites: str | Metabolite | Iterable,
+ *,
+ by_name: bool = False,
+ destructive: bool = False,
+) -> None:
+ """Remove metabolites, optionally matching by name across all compartments.
+
+ Parameters
+ ----------
+ by_name
+ If True, ``metabolites`` are metabolite *names*; every metabolite with a
+ matching name is removed, regardless of compartment (RAVEN ``isNames``).
+ If False, they are IDs/objects, resolved via cobra.
+ destructive
+ Passed to cobra: if True, also remove every reaction the metabolite
+ participates in.
+
+ Note
+ ----
+ With ``by_name=False`` this is just ``model.remove_metabolites`` — the
+ ``by_name`` cross-compartment deletion is the only thing this adds over cobra.
+ """
+ if by_name:
+ wanted = set(_as_list(metabolites))
+ targets = [m for m in model.metabolites if m.name in wanted]
+ else:
+ targets = model.metabolites.get_by_any(_as_list(metabolites))
+ if targets:
+ model.remove_metabolites(targets, destructive=destructive)
+
+
+def remove_genes(
+ model: cobra.Model,
+ genes: str | Gene | Iterable,
+ *,
+ blocked_reactions: str = "remove",
+ remove_orphans: bool = False,
+) -> list[str]:
+ """Remove genes and handle reactions left unable to carry flux.
+
+ GPR rewriting (with correct AND/OR semantics) and gene deletion are done by cobra;
+ this adds a policy for reactions whose GPR becomes empty (no enzyme left):
+
+ * ``"remove"`` — delete them (cobra's default).
+ * ``"constrain"`` — keep them but set bounds to ``(0, 0)``.
+ * ``"keep"`` — leave them with an empty GPR and unchanged bounds.
+
+ ``remove_orphans`` (only meaningful with ``blocked_reactions="remove"``)
+ passes through to cobra: drop metabolites *and* genes orphaned by the removal.
+
+ Returns
+ -------
+ list of str
+ IDs of the reactions that became flux-blocked (had a GPR, now empty).
+ """
+ if blocked_reactions not in ("remove", "constrain", "keep"):
+ raise ValueError(
+ f"blocked_reactions must be 'remove', 'constrain', or 'keep', "
+ f"got {blocked_reactions!r}"
+ )
+
+ # Resolve to gene IDs that are actually in the model (RAVEN filters likewise).
+ requested = [g.id if isinstance(g, Gene) else g for g in _as_list(genes)]
+ present = [gid for gid in requested if gid in model.genes]
+ if not present:
+ return []
+
+ # Reactions touched by these genes that currently have a GPR.
+ affected = set()
+ for gid in present:
+ affected.update(r.id for r in model.genes.get_by_id(gid).reactions)
+ had_gpr = {rid for rid in affected if model.reactions.get_by_id(rid).gene_reaction_rule}
+
+ # cobra rewrites GPRs (AST) and removes the gene objects; we manage reactions.
+ _cobra_remove_genes(model, present, remove_reactions=False)
+
+ blocked = [
+ rid for rid in had_gpr if not model.reactions.get_by_id(rid).gene_reaction_rule
+ ]
+
+ if blocked_reactions == "remove":
+ model.remove_reactions(blocked, remove_orphans=remove_orphans)
+ elif blocked_reactions == "constrain":
+ for rid in blocked:
+ model.reactions.get_by_id(rid).bounds = (0, 0)
+
+ return sorted(blocked)
diff --git a/src/raven_python/manipulation/simplify.py b/src/raven_python/manipulation/simplify.py
new file mode 100644
index 0000000..2deaccd
--- /dev/null
+++ b/src/raven_python/manipulation/simplify.py
@@ -0,0 +1,229 @@
+"""Reduce a model by removing/merging reactions that cannot carry flux.
+
+Four reduction modes that cobra does not cover out of the box:
+``remove_dead_end_reactions`` (reactions whose substrates have no producer),
+``remove_duplicate_reactions``, ``constrain_reversible_reactions`` (tighten bounds
+via FVA), and ``group_linear_reactions`` (lossy fold of unit-stoichiometry chains
+into one reaction; drops gene rules).
+
+Cobra-covered modes that you'd reach for separately:
+
+* No-flux removal → ``cobra.flux_analysis.find_blocked_reactions``.
+* Zero-interval removal → filter reactions with ``bounds == (0, 0)`` then prune.
+"""
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable
+
+import cobra
+from cobra.flux_analysis import flux_variability_analysis
+
+from raven_python.manipulation.irreversible import convert_to_irreversible
+
+
+def _prune_orphan_metabolites(model: cobra.Model) -> list[str]:
+ orphans = [m for m in model.metabolites if not m.reactions]
+ if orphans:
+ model.remove_metabolites(orphans)
+ return [m.id for m in orphans]
+
+
+def _can_produce_and_consume(met) -> tuple[bool, bool]:
+ """Whether the network can both produce and consume ``met`` (given directions)."""
+ produce = consume = False
+ for rxn in met.reactions:
+ coef = rxn.get_coefficient(met)
+ if coef > 0:
+ produce |= rxn.upper_bound > 0
+ consume |= rxn.lower_bound < 0
+ elif coef < 0:
+ consume |= rxn.upper_bound > 0
+ produce |= rxn.lower_bound < 0
+ return produce, consume
+
+
+def remove_dead_end_reactions(
+ model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> tuple[list[str], list[str]]:
+ """Iteratively remove dead-end reactions and metabolites.
+
+ A metabolite
+ is a dead end if it participates in only one reaction, or if (accounting for
+ reaction directionality) it can only be produced or only consumed — such
+ metabolites cannot carry steady-state flux, so the reactions touching them
+ are removed. Repeats until stable.
+
+ Returns ``(removed_reaction_ids, removed_metabolite_ids)``.
+ """
+ reserved = set(reserved or [])
+ removed_rxns: list[str] = []
+ removed_mets: list[str] = []
+ while True:
+ removed_mets += _prune_orphan_metabolites(model)
+ dead = [
+ m
+ for m in model.metabolites
+ if len(m.reactions) <= 1 or not all(_can_produce_and_consume(m))
+ ]
+ if not dead:
+ break
+ rxns = {r for m in dead for r in m.reactions}
+ to_delete = [r for r in rxns if r.id not in reserved]
+ if not to_delete:
+ break
+ removed_rxns += [r.id for r in to_delete]
+ model.remove_reactions(to_delete)
+ return removed_rxns, removed_mets
+
+
+def _signature(rxn):
+ mets = frozenset((m.id, c) for m, c in rxn.metabolites.items())
+ return (mets, rxn.lower_bound, rxn.upper_bound, rxn.objective_coefficient)
+
+
+def remove_duplicate_reactions(
+ model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> list[str]:
+ """Remove all-but-one of each set of duplicate reactions.
+
+ Reactions are duplicates when they have identical stoichiometry, bounds, and
+ objective coefficient. One of each set is kept (reserved reactions are never
+ removed). Returns the removed reaction IDs.
+ """
+ reserved = set(reserved or [])
+ groups: dict = {}
+ for rxn in model.reactions:
+ groups.setdefault(_signature(rxn), []).append(rxn)
+
+ removed: list[str] = []
+ for rxns in groups.values():
+ if len(rxns) <= 1:
+ continue
+ keep = rxns[-1]
+ to_remove = [r for r in rxns if r is not keep and r.id not in reserved]
+ if to_remove:
+ removed += [r.id for r in to_remove]
+ model.remove_reactions(to_remove)
+ return removed
+
+
+def constrain_reversible_reactions(
+ model: cobra.Model, *, eps: float = 1e-9
+) -> list[str]:
+ """Constrain reversible reactions that can only carry flux one way.
+
+ Runs FVA on
+ each reversible reaction; if it can only carry forward flux its lower bound
+ is set to 0, and if it can only carry reverse flux it is flipped to a forward
+ reaction (stoichiometry, bounds, and objective negated). Returns the changed
+ reaction IDs.
+ """
+ revs = [r for r in model.reactions if r.lower_bound < 0 < r.upper_bound]
+ if not revs:
+ return []
+ # Infeasible models surface as either OptimizationError (Gurobi/HiGHS) or
+ # NaN-filled ranges (some optlang backends silently). Catch both and raise
+ # a single clear error — the original ``abs(NaN) < eps`` comparison would
+ # have silently no-op'd, letting bogus "all reactions truly reversible"
+ # decisions sneak through.
+ try:
+ fva = flux_variability_analysis(
+ model, reaction_list=revs, fraction_of_optimum=0.0
+ )
+ except Exception as exc: # noqa: BLE001 - solver-family agnostic
+ raise RuntimeError(
+ "constrain_reversible_reactions: FVA failed — the model is likely "
+ "infeasible at fraction_of_optimum=0. Fix the infeasibility first "
+ "(often a missing exchange or an over-constrained essential). "
+ f"({exc})"
+ ) from exc
+ if fva[["minimum", "maximum"]].isna().any().any():
+ raise RuntimeError(
+ "constrain_reversible_reactions: FVA returned NaN ranges — the "
+ "model is infeasible at fraction_of_optimum=0. Fix the infeasibility "
+ "first (often a missing exchange or an over-constrained essential)."
+ )
+
+ changed: list[str] = []
+ for rxn in revs:
+ lo = fva.at[rxn.id, "minimum"]
+ hi = fva.at[rxn.id, "maximum"]
+ # Guard against ±inf ranges (unbounded objective): treat them as truly
+ # reversible rather than "zero" by the abs(·) < eps check.
+ if math.isinf(lo) or math.isinf(hi):
+ continue
+ min_zero, max_zero = abs(lo) < eps, abs(hi) < eps
+ if min_zero == max_zero: # both ~0 (blocked) or both nonzero (truly reversible)
+ continue
+ if max_zero: # only reverse flux → flip to a forward reaction
+ old_lb = rxn.lower_bound
+ rxn.add_metabolites({m: -2 * c for m, c in rxn.metabolites.items()})
+ rxn.bounds = (0.0, -old_lb)
+ rxn.objective_coefficient = -rxn.objective_coefficient
+ else: # only forward flux
+ rxn.lower_bound = 0.0
+ changed.append(rxn.id)
+ return changed
+
+
+def group_linear_reactions(
+ model: cobra.Model, *, reserved: Iterable[str] | None = None
+) -> None:
+ """Merge linear (single-producer, single-consumer) reaction chains.
+
+ **Lossy**: gene-reaction
+ associations are discarded (RAVEN does the same), since merged reactions have
+ no meaningful combined GPR. The model is first made irreversible, then any
+ metabolite that is produced by exactly one reaction and consumed by exactly
+ one reaction is eliminated by merging the two reactions. Mutates in place.
+ """
+ reserved = set(reserved or [])
+
+ # Lossy: drop all gene information.
+ for rxn in model.reactions:
+ rxn.gene_reaction_rule = ""
+ for gene in list(model.genes):
+ model.genes.remove(gene)
+
+ convert_to_irreversible(model)
+
+ # Worklist of metabolites to (re)consider for merging. Each metabolite
+ # participating in a merge can expose new linear chains in its neighbours,
+ # so we re-enqueue the touched mets rather than restart the whole scan
+ # (the old O(n²·m) restart-after-every-merge loop).
+ pending: list = list(model.metabolites)
+ seen_in_pass: set = set()
+ while pending:
+ met = pending.pop()
+ if met not in model.metabolites: # removed in a previous merge
+ continue
+ rxns = list(met.reactions)
+ if len(rxns) != 2 or any(r.id in reserved for r in rxns):
+ continue
+ r1, r2 = rxns
+ c1, c2 = r1.get_coefficient(met), r2.get_coefficient(met)
+ if (c1 > 0) == (c2 > 0): # need one producer and one consumer
+ continue
+ ratio = abs(c1 / c2)
+ new_lb = max(r1.lower_bound, r2.lower_bound / ratio)
+ new_ub = min(r1.upper_bound, r2.upper_bound / ratio)
+ new_obj = r1.objective_coefficient + r2.objective_coefficient * ratio
+ # Re-enqueue every metabolite touched by either side — the merge can
+ # turn neighbours into single-producer/consumer chains in turn.
+ touched = {m for m in r1.metabolites} | {m for m in r2.metabolites}
+ # Merge r2*ratio into r1; the shared metabolite cancels and is dropped.
+ r1.add_metabolites({m: c * ratio for m, c in r2.metabolites.items()})
+ model.remove_reactions([r2])
+ r1.bounds = (new_lb, new_ub)
+ r1.objective_coefficient = new_obj
+ seen_in_pass.clear()
+ for m in touched:
+ if m in model.metabolites and id(m) not in seen_in_pass:
+ seen_in_pass.add(id(m))
+ pending.append(m)
+ # One terminal cleanup pass (cheap; only what remains).
+ empty = [r for r in model.reactions if not r.metabolites]
+ if empty:
+ model.remove_reactions(empty)
+ _prune_orphan_metabolites(model)
diff --git a/src/raven_python/manipulation/transfer.py b/src/raven_python/manipulation/transfer.py
new file mode 100644
index 0000000..b867f02
--- /dev/null
+++ b/src/raven_python/manipulation/transfer.py
@@ -0,0 +1,144 @@
+"""Copy reactions (with their metabolites and genes) from another model.
+
+cobra's ``Model.merge`` / ``add_reactions`` match metabolites strictly by id. This
+transfers a chosen set of reactions from a *source* model into a draft, matching
+metabolites by **name[compartment]** instead — so a compound present in both models
+under different ids is reused rather than duplicated, and only genuinely new
+metabolites are created (copying the source's id, formula,
+charge, and annotation). New genes are auto-created by cobra when the GPR is set.
+This is the post-``getModelFromHomology`` "copy a few more reactions across"
+workflow.
+"""
+from __future__ import annotations
+
+import copy
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Reaction
+
+from raven_python.manipulation.add import _new_met_id
+
+
+def _name_comp(met: Metabolite) -> str:
+ return f"{met.name}[{met.compartment}]"
+
+
+def add_reactions_from_model(
+ model: cobra.Model,
+ source_model: cobra.Model,
+ reactions: str | Iterable[str],
+ *,
+ genes: bool | str | Iterable[str] = False,
+ note: str | None = "Added via add_reactions_from_model()",
+ confidence: int | None = None,
+) -> list[Reaction]:
+ """Copy reactions from ``source_model`` into ``model``.
+ Parameters
+ ----------
+ model
+ Draft model to copy into (mutated in place).
+ source_model
+ Model to copy reactions from.
+ reactions
+ Reaction ID(s) in ``source_model``. Reactions already present in
+ ``model`` (by ID) are skipped.
+ genes
+ ``False`` (default): add reactions without GPRs. ``True``: copy each
+ reaction's GPR from the source. A string: use it as the GPR for every
+ added reaction. A list: per-reaction GPRs (matching the reactions that
+ are actually added). New genes are created automatically.
+ note
+ Stored in each added reaction's ``notes['note']`` (set ``None`` to skip).
+ confidence
+ If given, stored in each added reaction's ``notes['confidence_score']``.
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The reactions added, in input order.
+ """
+ rxn_ids = [reactions] if isinstance(reactions, str) else list(reactions)
+ missing = [r for r in rxn_ids if r not in source_model.reactions]
+ if missing:
+ raise ValueError(f"Reactions not found in the source model: {missing}")
+
+ new_ids = [r for r in rxn_ids if r not in model.reactions]
+ if not new_ids:
+ raise ValueError("All reactions are already in the model.")
+ source_rxns = [source_model.reactions.get_by_id(r) for r in new_ids]
+
+ if genes is False:
+ rules = [""] * len(source_rxns)
+ elif genes is True:
+ rules = [r.gene_reaction_rule for r in source_rxns]
+ elif isinstance(genes, str):
+ rules = [genes] * len(source_rxns)
+ else:
+ rules = list(genes)
+ if len(rules) != len(source_rxns):
+ raise ValueError(
+ f"genes list has {len(rules)} rules but {len(source_rxns)} "
+ "reactions are being added."
+ )
+
+ # Match metabolites by name[comp]; create only the genuinely new ones.
+ draft_by_name = {_name_comp(m): m for m in model.metabolites}
+ new_mets: list[Metabolite] = []
+ pending: set[str] = set()
+ # Track ids minted within this batch so two source mets that share an id
+ # but differ in name[comp] don't collide when add_metabolites runs.
+ pending_ids: set[str] = set()
+ for srx in source_rxns:
+ for met in srx.metabolites:
+ key = _name_comp(met)
+ if key in draft_by_name or key in pending:
+ continue
+ pending.add(key)
+ if met.id not in model.metabolites and met.id not in pending_ids:
+ new_id = met.id
+ else:
+ # _new_met_id only knows the model; loop past in-batch hits too.
+ new_id = _new_met_id(model, "m")
+ while new_id in pending_ids:
+ n = int(new_id[1:]) + 1
+ new_id = f"m{n}"
+ while new_id in model.metabolites:
+ n += 1
+ new_id = f"m{n}"
+ pending_ids.add(new_id)
+ new_met = Metabolite(
+ new_id,
+ name=met.name,
+ compartment=met.compartment,
+ formula=met.formula,
+ charge=met.charge,
+ )
+ new_met.annotation = copy.deepcopy(met.annotation)
+ new_met.notes = copy.deepcopy(met.notes)
+ new_mets.append(new_met)
+ draft_by_name[key] = new_met
+ if new_mets:
+ model.add_metabolites(new_mets)
+
+ added: list[Reaction] = []
+ for srx, rule in zip(source_rxns, rules, strict=True):
+ rxn = Reaction(srx.id, name=srx.name)
+ rxn.bounds = srx.bounds
+ rxn.subsystem = srx.subsystem
+ model.add_reactions([rxn])
+ rxn.add_metabolites(
+ {draft_by_name[_name_comp(met)]: coef for met, coef in srx.metabolites.items()}
+ )
+ if rule:
+ rxn.gene_reaction_rule = rule
+ rxn.annotation = copy.deepcopy(srx.annotation)
+ notes = copy.deepcopy(srx.notes)
+ if note is not None:
+ notes["note"] = note
+ if confidence is not None:
+ notes["confidence_score"] = confidence
+ rxn.notes = notes
+ added.append(rxn)
+
+ return added
diff --git a/src/raven_python/manipulation/transport.py b/src/raven_python/manipulation/transport.py
new file mode 100644
index 0000000..d0c1bf1
--- /dev/null
+++ b/src/raven_python/manipulation/transport.py
@@ -0,0 +1,157 @@
+"""Add transport reactions between compartments.
+
+cobra has no transport-reaction primitive. For each metabolite this matches the
+species by *name* across compartments (the source in ``from_compartment`` and its
+same-named twin in each target compartment), optionally creating the target
+metabolite, and
+builds a ``-1 from / +1 to`` reaction with a sequential ``tr_0001`` ID.
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from collections.abc import Iterable
+
+import cobra
+from cobra import Metabolite, Reaction
+
+from raven_python.manipulation.add import _new_met_id
+
+
+def _index_by_name(mets: Iterable[Metabolite], compartment: str) -> dict[str, Metabolite]:
+ """Index metabolites by name, warning when a name is duplicated.
+
+ Same-name duplicates in a single compartment are unusual but legal in cobra,
+ and the previous one-pass dict comprehension silently dropped all but one.
+ """
+ out: dict[str, list[Metabolite]] = {}
+ for m in mets:
+ out.setdefault(m.name, []).append(m)
+ chosen: dict[str, Metabolite] = {}
+ for name, group in out.items():
+ if len(group) > 1:
+ warnings.warn(
+ f"Multiple metabolites named {name!r} in compartment {compartment!r} "
+ f"({[m.id for m in group]}); using {group[0].id!r} for transport.",
+ stacklevel=3,
+ )
+ chosen[name] = group[0]
+ return chosen
+
+
+def _transport_id_factory(model: cobra.Model, prefix: str):
+ pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+ used = [int(m.group(1)) for r in model.reactions if (m := pattern.match(r.id))]
+ counter = max(used) + 1 if used else 1
+
+ def next_id() -> str:
+ nonlocal counter
+ while f"{prefix}{counter:04d}" in model.reactions:
+ counter += 1
+ rid = f"{prefix}{counter:04d}"
+ counter += 1
+ return rid
+
+ return next_id
+
+
+def add_transport_reactions(
+ model: cobra.Model,
+ from_compartment: str,
+ to_compartments: str | Iterable[str],
+ metabolite_names: str | Iterable[str] | None = None,
+ *,
+ reversible: bool = True,
+ only_to_existing: bool = True,
+ id_prefix: str = "tr_",
+) -> list[Reaction]:
+ """Add transport reactions from one compartment to one or more others.
+ Parameters
+ ----------
+ from_compartment
+ Source compartment id.
+ to_compartments
+ Target compartment id(s).
+ metabolite_names
+ Names of metabolites to transport. Default: every metabolite in
+ ``from_compartment``.
+ reversible
+ If True (default), bounds span the cobra configuration default
+ (reversible); otherwise lower bound 0.
+ only_to_existing
+ If True (default), only transport a metabolite into a target
+ compartment where a same-named metabolite already exists. If False,
+ create the missing target metabolite (copying name/formula/charge/
+ annotation from the source) before adding the transport.
+ id_prefix
+ Prefix for the sequential reaction IDs (``tr_0001``, ...).
+
+ Returns
+ -------
+ list of cobra.Reaction
+ The transport reactions added, in creation order.
+ """
+ # cobra's `model.compartments` only lists compartments that have metabolites;
+ # include registered-but-empty ones so transport can target an empty compartment.
+ known = set(model.compartments) | set(model._compartments)
+ if from_compartment not in known:
+ raise ValueError(f"Compartment {from_compartment!r} is not in the model.")
+ if isinstance(to_compartments, str):
+ to_compartments = [to_compartments]
+ else:
+ to_compartments = list(to_compartments)
+ for comp in to_compartments:
+ if comp not in known:
+ raise ValueError(f"Compartment {comp!r} is not in the model.")
+
+ source = _index_by_name(
+ (m for m in model.metabolites if m.compartment == from_compartment),
+ from_compartment,
+ )
+ if metabolite_names is None:
+ names = list(source)
+ else:
+ names = [metabolite_names] if isinstance(metabolite_names, str) else list(metabolite_names)
+ missing = [n for n in names if n not in source]
+ if missing:
+ raise ValueError(
+ f"Metabolites not found in compartment {from_compartment!r}: {missing}"
+ )
+
+ cfg = cobra.Configuration()
+ bounds = (cfg.lower_bound, cfg.upper_bound) if reversible else (0.0, cfg.upper_bound)
+ from_name = model.compartments.get(from_compartment) or from_compartment
+ next_id = _transport_id_factory(model, id_prefix)
+
+ added: list[Reaction] = []
+ for to_comp in to_compartments:
+ to_name = model.compartments.get(to_comp) or to_comp
+ targets = _index_by_name(
+ (m for m in model.metabolites if m.compartment == to_comp),
+ to_comp,
+ )
+ for name in names:
+ src = source[name]
+ dst = targets.get(name)
+ if dst is None:
+ if only_to_existing:
+ continue
+ dst = Metabolite(
+ _new_met_id(model, "m"),
+ name=name,
+ compartment=to_comp,
+ formula=src.formula,
+ charge=src.charge,
+ )
+ dst.annotation = dict(src.annotation)
+ model.add_metabolites([dst])
+ targets[name] = dst
+
+ rxn = Reaction(next_id())
+ rxn.name = f"{name} transport, {from_name}-{to_name}"
+ rxn.bounds = bounds
+ model.add_reactions([rxn])
+ rxn.add_metabolites({src: -1, dst: 1})
+ added.append(rxn)
+
+ return added
diff --git a/src/raven_python/reconstruction/__init__.py b/src/raven_python/reconstruction/__init__.py
new file mode 100644
index 0000000..a270e2c
--- /dev/null
+++ b/src/raven_python/reconstruction/__init__.py
@@ -0,0 +1 @@
+"""De novo reconstruction from KEGG and protein homology (BLAST/DIAMOND)."""
diff --git a/src/raven_python/reconstruction/homology/__init__.py b/src/raven_python/reconstruction/homology/__init__.py
new file mode 100644
index 0000000..6ed9748
--- /dev/null
+++ b/src/raven_python/reconstruction/homology/__init__.py
@@ -0,0 +1,19 @@
+"""Homology-based reconstruction from template models (getModelFromHomology, BLAST/DIAMOND)."""
+from raven_python.reconstruction.homology.blast import (
+ blast_from_table,
+ run_blast,
+ run_diamond,
+)
+from raven_python.reconstruction.homology.hits import HIT_COLUMNS, make_ortholog_hits, validate_hits
+from raven_python.reconstruction.homology.homology import HomologyResult, get_model_from_homology
+
+__all__ = [
+ "HIT_COLUMNS",
+ "HomologyResult",
+ "blast_from_table",
+ "get_model_from_homology",
+ "make_ortholog_hits",
+ "run_blast",
+ "run_diamond",
+ "validate_hits",
+]
diff --git a/src/raven_python/reconstruction/homology/blast.py b/src/raven_python/reconstruction/homology/blast.py
new file mode 100644
index 0000000..246ddab
--- /dev/null
+++ b/src/raven_python/reconstruction/homology/blast.py
@@ -0,0 +1,146 @@
+"""Run BLAST+ / DIAMOND (or load precomputed hits) into a homology hits table.
+
+Each producer returns the bidirectional hits DataFrame (``HIT_COLUMNS``) consumed by
+:func:`~raven_python.reconstruction.homology.get_model_from_homology`. Binaries are
+located via :func:`raven_python.binaries.resolve_binary` (arg → env → PATH → bundled).
+"""
+from __future__ import annotations
+
+import io
+import subprocess
+import tempfile
+from collections.abc import Sequence
+from pathlib import Path
+
+import pandas as pd
+
+from raven_python.binaries import resolve_binary
+from raven_python.reconstruction.homology.hits import HIT_COLUMNS, validate_hits
+
+# Tabular output columns requested from BLAST+/DIAMOND, in order.
+_OUTFMT_FIELDS = ["qseqid", "sseqid", "evalue", "pident", "length", "bitscore", "ppos"]
+_FIELD_TO_HIT = {
+ "qseqid": "from_gene", "sseqid": "to_gene", "evalue": "evalue",
+ "pident": "identity", "length": "align_len", "bitscore": "bitscore", "ppos": "ppos",
+}
+
+
+def _parse_tabular(text: str, from_id: str, to_id: str, sep: str) -> pd.DataFrame:
+ """Parse one BLAST/DIAMOND tabular output into hit rows for one direction."""
+ if not text.strip():
+ return pd.DataFrame(columns=HIT_COLUMNS)
+ df = pd.read_csv(io.StringIO(text), sep=sep, names=_OUTFMT_FIELDS, dtype={0: str, 1: str})
+ df = df.rename(columns=_FIELD_TO_HIT)
+ df["from_id"] = from_id
+ df["to_id"] = to_id
+ return df[HIT_COLUMNS]
+
+
+def _as_list(x):
+ return [x] if isinstance(x, (str, Path)) else list(x)
+
+
+def _run(cmd: list[str]) -> str:
+ proc = subprocess.run(cmd, capture_output=True, text=True)
+ if proc.returncode != 0:
+ raise RuntimeError(f"{cmd[0]} failed:\n{proc.stderr.strip()}")
+ return proc.stdout
+
+
+def run_blast(
+ organism_id: str,
+ fasta: str | Path,
+ model_ids: Sequence[str],
+ ref_fastas: Sequence[str | Path],
+ *,
+ evalue: float = 1e-5,
+ threads: int = 1,
+ blastp: str | Path | None = None,
+ makeblastdb: str | Path | None = None,
+) -> pd.DataFrame:
+ """Bidirectional BLAST+ between an organism and template organisms.
+
+ Returns the hits DataFrame (filtered at
+ ``evalue``). Requires BLAST+ (`blastp`, `makeblastdb`).
+ """
+ model_ids = list(model_ids)
+ ref_fastas = _as_list(ref_fastas)
+ if len(model_ids) != len(ref_fastas):
+ raise ValueError("model_ids and ref_fastas must have the same length.")
+ blastp = resolve_binary("blastp", binary=blastp)
+ makeblastdb = resolve_binary("makeblastdb", binary=makeblastdb)
+ outfmt = "10 " + " ".join(_OUTFMT_FIELDS) # 10 = CSV
+
+ frames = []
+ with tempfile.TemporaryDirectory() as tmp:
+ tmp = Path(tmp)
+
+ def blastp_dir(query, subject_fasta, from_id, to_id):
+ db = tmp / f"db_{from_id}_{to_id}"
+ _run([makeblastdb, "-in", str(subject_fasta), "-dbtype", "prot", "-out", str(db)])
+ out = _run([
+ blastp, "-query", str(query), "-db", str(db), "-evalue", str(evalue),
+ "-outfmt", outfmt, "-num_threads", str(threads),
+ ])
+ return _parse_tabular(out, from_id, to_id, sep=",")
+
+ for model_id, ref in zip(model_ids, ref_fastas, strict=True):
+ # template -> organism, and organism -> template
+ frames.append(blastp_dir(ref, fasta, model_id, organism_id))
+ frames.append(blastp_dir(fasta, ref, organism_id, model_id))
+ return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS)
+
+
+def run_diamond(
+ organism_id: str,
+ fasta: str | Path,
+ model_ids: Sequence[str],
+ ref_fastas: Sequence[str | Path],
+ *,
+ evalue: float = 1e-5,
+ threads: int = 1,
+ sensitivity: str = "--more-sensitive",
+ diamond: str | Path | None = None,
+) -> pd.DataFrame:
+ """Bidirectional DIAMOND between an organism and template organisms.
+
+ Returns the hits DataFrame. Requires DIAMOND.
+ """
+ model_ids = list(model_ids)
+ ref_fastas = _as_list(ref_fastas)
+ if len(model_ids) != len(ref_fastas):
+ raise ValueError("model_ids and ref_fastas must have the same length.")
+ diamond = resolve_binary("diamond", binary=diamond)
+
+ frames = []
+ with tempfile.TemporaryDirectory() as tmp:
+ tmp = Path(tmp)
+
+ def diamond_dir(query, subject_fasta, from_id, to_id):
+ db = tmp / f"db_{from_id}_{to_id}"
+ _run([diamond, "makedb", "--in", str(subject_fasta), "--db", str(db)])
+ cmd = [diamond, "blastp", "--query", str(query), "--db", str(db),
+ "--evalue", str(evalue), "--outfmt", "6", *_OUTFMT_FIELDS,
+ "--threads", str(threads)]
+ if sensitivity:
+ cmd.append(sensitivity)
+ return _parse_tabular(_run(cmd), from_id, to_id, sep="\t")
+
+ for model_id, ref in zip(model_ids, ref_fastas, strict=True):
+ frames.append(diamond_dir(ref, fasta, model_id, organism_id))
+ frames.append(diamond_dir(fasta, ref, organism_id, model_id))
+ return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS)
+
+
+def blast_from_table(source: str | Path | pd.DataFrame) -> pd.DataFrame:
+ """Load a precomputed homology hits table (CSV path or DataFrame).
+
+ a plain CSV/DataFrame, not Excel.
+ Must contain the ``HIT_COLUMNS`` columns.
+ """
+ # Force gene-id columns to str: an all-numeric gene-id column (e.g. Entrez ids)
+ # would otherwise be read as int64 and never match the string gene ids in a model.
+ df = (source if isinstance(source, pd.DataFrame)
+ else pd.read_csv(source, dtype={"from_gene": str, "to_gene": str}))
+ validate_hits(df)
+ return df[HIT_COLUMNS].copy()
diff --git a/src/raven_python/reconstruction/homology/hits.py b/src/raven_python/reconstruction/homology/hits.py
new file mode 100644
index 0000000..2f706c3
--- /dev/null
+++ b/src/raven_python/reconstruction/homology/hits.py
@@ -0,0 +1,64 @@
+"""Homology hits table — the data structure shared across the homology track.
+
+The hits are one tidy ``pandas.DataFrame`` of bidirectional hits, one row per hit.
+This is the currency between the BLAST / DIAMOND wrappers and
+:func:`get_model_from_homology`.
+
+Columns (``HIT_COLUMNS``):
+``from_id, to_id`` (organism/model ids), ``from_gene, to_gene`` (the matched
+genes; ``from_gene`` is in ``from_id``), and the hit metrics
+``evalue, identity, align_len, bitscore, ppos``.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import pandas as pd
+
+HIT_COLUMNS = [
+ "from_id", "to_id", "from_gene", "to_gene",
+ "evalue", "identity", "align_len", "bitscore", "ppos",
+]
+
+
+def make_ortholog_hits(
+ ortholog_pairs: Iterable[tuple[str, str]],
+ source_model_id: str,
+ target_id: str,
+) -> pd.DataFrame:
+ """Build a bidirectional hits table from a predefined ortholog list.
+
+ Each ``(source_gene, target_gene)``
+ pair is emitted in both directions with sentinel metrics (evalue 0,
+ identity 100, align_len 1000, bitscore 1000, ppos 100) so every pair passes
+ any reasonable filter. Lets a known ortholog mapping feed
+ :func:`get_model_from_homology` with no BLAST run — also the testing entry
+ point.
+
+ Parameters
+ ----------
+ ortholog_pairs
+ Iterable of ``(source_gene, target_gene)`` — source = template/model
+ organism, target = the organism being built.
+ source_model_id
+ ID of the template model the source genes belong to.
+ target_id
+ ID of the organism to build a model for (``model_for``).
+ """
+ pairs = [(str(s), str(t)) for s, t in ortholog_pairs]
+ if not pairs:
+ raise ValueError("ortholog_pairs is empty.")
+
+ rows = []
+ for source_gene, target_gene in pairs:
+ rows.append((source_model_id, target_id, source_gene, target_gene, 0.0, 100.0, 1000, 1000.0, 100.0))
+ rows.append((target_id, source_model_id, target_gene, source_gene, 0.0, 100.0, 1000, 1000.0, 100.0))
+ return pd.DataFrame(rows, columns=HIT_COLUMNS)
+
+
+def validate_hits(hits: pd.DataFrame) -> pd.DataFrame:
+ """Check a hits DataFrame has the required columns; return it unchanged."""
+ missing = [c for c in HIT_COLUMNS if c not in hits.columns]
+ if missing:
+ raise ValueError(f"hits is missing required columns: {missing}")
+ return hits
diff --git a/src/raven_python/reconstruction/homology/homology.py b/src/raven_python/reconstruction/homology/homology.py
new file mode 100644
index 0000000..bc6fa41
--- /dev/null
+++ b/src/raven_python/reconstruction/homology/homology.py
@@ -0,0 +1,281 @@
+"""Build a draft model from template models + homology hits.
+
+Key behaviour:
+
+* clear ``bidirectional`` / ``best_hits_only`` parameters control the hit-filtering
+ strictness (cleaner than a single overloaded "strictness" knob);
+* GPR rewriting works on cobra's AST, not regex;
+* explicit ``complex_policy`` decides what happens to AND-subunits that lack an
+ ortholog (drop, keep, drop-the-reaction);
+* best-hit selection is bitscore-based;
+* the ortholog map is a DataFrame; provenance is structured.
+"""
+from __future__ import annotations
+
+import ast
+import warnings
+from dataclasses import dataclass, field
+
+import cobra
+import pandas as pd
+
+from raven_python.manipulation.merge import merge_models
+from raven_python.reconstruction.homology.hits import validate_hits
+
+
+@dataclass
+class HomologyResult:
+ """Result of :func:`get_model_from_homology`.
+
+ Attributes
+ ----------
+ model
+ The draft ``cobra.Model``.
+ gene_map
+ ``{model_id: {template_gene: [new_gene, ...]}}`` ortholog mapping used.
+ """
+
+ model: cobra.Model
+ gene_map: dict = field(default_factory=dict)
+
+
+class _Unmapped:
+ """A GPR leaf gene with no ortholog in the new organism."""
+
+ __slots__ = ("gene",)
+
+ def __init__(self, gene: str):
+ self.gene = gene
+
+
+def _rewrite_node(node, ortho: dict, policy: str, model_id: str):
+ """Rewrite a GPR AST node, substituting template genes by their orthologs.
+
+ Returns a GPR sub-expression string, ``None`` (nothing survives), or an
+ ``_Unmapped`` for a bare unmapped leaf (the parent decides what to do).
+ """
+ if isinstance(node, ast.Name):
+ new_genes = ortho.get(node.id)
+ if new_genes:
+ return new_genes[0] if len(new_genes) == 1 else "(" + " or ".join(new_genes) + ")"
+ return _Unmapped(node.id)
+
+ if isinstance(node, ast.BoolOp):
+ children = [_rewrite_node(c, ortho, policy, model_id) for c in node.values]
+ if isinstance(node.op, ast.Or):
+ # An isozyme branch with no ortholog is simply absent.
+ parts = [c for c in children if isinstance(c, str)]
+ if not parts:
+ return None
+ return parts[0] if len(parts) == 1 else "(" + " or ".join(parts) + ")"
+ # And: apply the complex policy to unmapped subunits.
+ parts = []
+ for child in children:
+ if isinstance(child, str):
+ parts.append(child)
+ elif isinstance(child, _Unmapped):
+ if policy == "flag":
+ parts.append(f"OLD_{model_id}_{child.gene}")
+ elif policy == "drop":
+ return None # incomplete complex -> reaction unsupported
+ # policy == "keep": drop the unmapped subunit
+ else: # None (a dead sub-branch)
+ if policy == "drop":
+ return None
+ if not parts:
+ return None
+ return parts[0] if len(parts) == 1 else "(" + " and ".join(parts) + ")"
+
+ return None
+
+
+def _rewrite_gpr(rxn, ortho: dict, policy: str, model_id: str):
+ """Return the rewritten GPR string, or None if the reaction is unsupported."""
+ if not rxn.gene_reaction_rule:
+ return None
+ # A reaction is only transferred if at least one of its genes has an ortholog.
+ if not any(g.id in ortho for g in rxn.genes):
+ return None
+ result = _rewrite_node(rxn.gpr.body, ortho, policy, model_id)
+ if isinstance(result, str):
+ return result
+ return None
+
+
+def _strictness_to_params(strictness, bidirectional, best_hits_only, complex_policy, map_direction):
+ """Map RAVEN's strictness 1/2/3 onto the clearer parameters (compat)."""
+ if strictness is None:
+ return bidirectional, best_hits_only, complex_policy, map_direction
+ if strictness == 1:
+ return True, False, complex_policy, map_direction
+ if strictness == 2:
+ return False, False, complex_policy, map_direction
+ if strictness == 3:
+ return True, True, complex_policy, map_direction
+ raise ValueError(f"strictness must be 1, 2 or 3, got {strictness}")
+
+
+def _ortholog_map(
+ hits, model_for, model_ids, *, bidirectional, best_hits_only, score, map_direction,
+ model_genes, max_evalue, min_align_len, min_identity,
+):
+ """Build {model_id: {template_gene: [new_gene, ...]}} from the hits table."""
+ h = hits[
+ (hits.evalue <= max_evalue)
+ & (hits.align_len >= min_align_len)
+ & (hits.identity >= min_identity)
+ ]
+
+ if best_hits_only:
+ ascending = score == "evalue"
+ h = h.sort_values(score, ascending=ascending)
+ h = h.groupby(["from_id", "to_id", "from_gene"], sort=False).head(1)
+
+ # Directional views, normalised to (model_id, new_gene, template_gene).
+ fwd = (
+ h[h.from_id == model_for][["to_id", "from_gene", "to_gene"]]
+ .rename(columns={"to_id": "model_id", "from_gene": "new_gene", "to_gene": "template_gene"})
+ )
+ rev = (
+ h[h.to_id == model_for][["from_id", "from_gene", "to_gene"]]
+ .rename(columns={"from_id": "model_id", "from_gene": "template_gene", "to_gene": "new_gene"})
+ )
+ fwd = fwd[fwd.model_id.isin(model_ids)]
+ rev = rev[rev.model_id.isin(model_ids)]
+
+ if bidirectional:
+ pairs = fwd.merge(rev, on=["model_id", "new_gene", "template_gene"], how="inner")
+ elif map_direction == "new_to_old":
+ pairs = fwd
+ else:
+ pairs = rev
+ pairs = pairs[["model_id", "new_gene", "template_gene"]].drop_duplicates()
+ if pairs.empty:
+ return {}
+
+ # Keep only template genes that actually exist in their model.
+ pairs = pairs[pairs.apply(lambda r: r.template_gene in model_genes.get(r.model_id, ()), axis=1)]
+
+ ortho: dict = {}
+ for model_id, template_gene, new_gene in zip(pairs.model_id, pairs.template_gene, pairs.new_gene, strict=True):
+ ortho.setdefault(model_id, {}).setdefault(template_gene, [])
+ if new_gene not in ortho[model_id][template_gene]:
+ ortho[model_id][template_gene].append(new_gene)
+ for per_model in ortho.values():
+ for genes in per_model.values():
+ genes.sort()
+ return ortho
+
+
+def _apply_preferred_order(ortho: dict, order: list[str]) -> dict:
+ """Each new gene's reactions come from the first model (in order) that maps it."""
+ winner: dict = {} # new_gene -> winning model_id
+ for model_id in order:
+ for new_genes in ortho.get(model_id, {}).values():
+ for ng in new_genes:
+ winner.setdefault(ng, model_id)
+ pruned: dict = {mid: {} for mid in ortho}
+ for model_id, per_model in ortho.items():
+ for template_gene, new_genes in per_model.items():
+ kept = [ng for ng in new_genes if winner.get(ng) == model_id]
+ if kept:
+ pruned[model_id][template_gene] = kept
+ return pruned
+
+
+def get_model_from_homology(
+ models,
+ hits: pd.DataFrame,
+ model_for: str,
+ *,
+ preferred_order=None,
+ bidirectional: bool = True,
+ best_hits_only: bool = False,
+ map_direction: str = "new_to_old",
+ score: str = "bitscore",
+ complex_policy: str = "flag",
+ only_genes_in_models: bool = False,
+ max_evalue: float = 1e-30,
+ min_align_len: int = 200,
+ min_identity: float = 40,
+ strictness: int | None = None,
+) -> HomologyResult:
+ """Build a draft model for ``model_for`` by transferring reactions from templates.
+
+ ``strictness`` (1/2/3) is a legacy alias for ``bidirectional`` / ``best_hits_only``.
+ """
+ if isinstance(models, cobra.Model):
+ models = [models]
+ if complex_policy not in ("flag", "keep", "drop"):
+ raise ValueError(f"complex_policy must be flag/keep/drop, got {complex_policy!r}")
+ if map_direction not in ("new_to_old", "old_to_new"):
+ raise ValueError(f"map_direction must be new_to_old/old_to_new, got {map_direction!r}")
+ bidirectional, best_hits_only, complex_policy, map_direction = _strictness_to_params(
+ strictness, bidirectional, best_hits_only, complex_policy, map_direction
+ )
+ validate_hits(hits)
+
+ model_by_id = {m.id: m for m in models}
+ model_ids = list(model_by_id)
+ model_genes = {mid: {g.id for g in m.genes} for mid, m in model_by_id.items()}
+ all_model_genes = set().union(*model_genes.values()) if model_genes else set()
+
+ # Sanity: each template should overlap the hits by >=5% of its genes.
+ for mid, genes in model_genes.items():
+ in_hits = genes & (set(hits.from_gene) | set(hits.to_gene))
+ if genes and len(in_hits) < 0.05 * len(genes):
+ warnings.warn(
+ f"<5% of genes in template '{mid}' appear in the hits table; "
+ "check that the FASTA and model use the same gene identifiers.",
+ stacklevel=2,
+ )
+
+ if only_genes_in_models:
+ hits = hits[hits.from_gene.isin(all_model_genes) | hits.to_gene.isin(all_model_genes)]
+
+ ortho = _ortholog_map(
+ hits, model_for, model_ids, bidirectional=bidirectional, best_hits_only=best_hits_only,
+ score=score, map_direction=map_direction,
+ model_genes=model_genes, max_evalue=max_evalue, min_align_len=min_align_len,
+ min_identity=min_identity,
+ )
+
+ order = [str(x) for x in preferred_order] if preferred_order else model_ids
+ if preferred_order and len(models) > 1:
+ ortho = _apply_preferred_order(ortho, order)
+
+ # Build a per-template model holding only the transferred reactions with rewritten GPRs.
+ transferred = []
+ for mid in order:
+ model = model_by_id.get(mid)
+ if model is None:
+ continue
+ per_model = ortho.get(mid, {})
+ m = model.copy()
+ keep: dict[str, str] = {}
+ for rxn in m.reactions:
+ new_gpr = _rewrite_gpr(rxn, per_model, complex_policy, mid)
+ if new_gpr is not None:
+ keep[rxn.id] = new_gpr
+ m.remove_reactions([r for r in m.reactions if r.id not in keep], remove_orphans=True)
+ for rid, gpr in keep.items():
+ r = m.reactions.get_by_id(rid)
+ r.gene_reaction_rule = gpr
+ r.notes = {"note": "Included by get_model_from_homology", "confidence_score": 2,
+ "homology_source": mid}
+ if m.reactions:
+ transferred.append(m)
+
+ if transferred:
+ draft = merge_models(transferred, match_by="name")
+ else:
+ draft = cobra.Model()
+ draft.id = model_for
+ draft.name = "Generated by get_model_from_homology using " + ", ".join(model_ids)
+
+ # Drop OLD_ placeholder genes that ended up orphaned (none survive in OR branches by construction).
+ orphan_genes = [g for g in draft.genes if not g.reactions]
+ for g in orphan_genes:
+ draft.genes.remove(g)
+
+ return HomologyResult(model=draft, gene_map=ortho)
diff --git a/src/raven_python/reconstruction/kegg/__init__.py b/src/raven_python/reconstruction/kegg/__init__.py
new file mode 100644
index 0000000..5d27602
--- /dev/null
+++ b/src/raven_python/reconstruction/kegg/__init__.py
@@ -0,0 +1,77 @@
+"""KEGG-based draft reconstruction (getKEGGModelForOrganism and friends).
+
+Maintainer build steps: 3b.1 download (:mod:`.download`), 3b.2 dump parsing
+(:mod:`.parse`), 3b.3 HMM libraries (:mod:`.hmm`, :mod:`.taxonomy`). Runtime:
+3b.4 model for a KEGG species (:mod:`.organism`).
+"""
+from raven_python.reconstruction.kegg.download import (
+ download_kegg_dump,
+ extract_kegg_dump,
+ fetch_kegg_files,
+)
+from raven_python.reconstruction.kegg.hmm import (
+ build_hmm_library,
+ build_ko_fastas,
+ build_ko_hmm,
+)
+from raven_python.reconstruction.kegg.organism import (
+ get_kegg_model_for_organism,
+ get_kegg_model_for_organism_from_artefacts,
+)
+from raven_python.reconstruction.kegg.parse import (
+ KeggCompound,
+ KeggKO,
+ KeggReaction,
+ build_kegg_tables,
+ build_reference_model,
+ parse_kegg_compounds,
+ parse_kegg_dump,
+ parse_kegg_kos,
+ parse_kegg_reactions,
+ read_kegg_table,
+ stream_organism_gene_ko,
+ write_kegg_tables,
+)
+from raven_python.reconstruction.kegg.query import (
+ assign_kos,
+ get_kegg_model_from_sequences,
+ get_kegg_model_from_sequences_with_artefacts,
+ parse_hmmscan_tblout,
+ run_hmmscan,
+)
+from raven_python.reconstruction.kegg.taxonomy import (
+ organism_domains,
+ organisms_in_domain,
+ parse_taxonomy,
+)
+
+__all__ = [
+ "KeggCompound",
+ "KeggKO",
+ "KeggReaction",
+ "assign_kos",
+ "build_hmm_library",
+ "build_kegg_tables",
+ "build_ko_fastas",
+ "build_ko_hmm",
+ "build_reference_model",
+ "download_kegg_dump",
+ "extract_kegg_dump",
+ "fetch_kegg_files",
+ "get_kegg_model_for_organism",
+ "get_kegg_model_for_organism_from_artefacts",
+ "get_kegg_model_from_sequences",
+ "get_kegg_model_from_sequences_with_artefacts",
+ "organism_domains",
+ "organisms_in_domain",
+ "parse_hmmscan_tblout",
+ "parse_kegg_compounds",
+ "parse_kegg_dump",
+ "parse_kegg_kos",
+ "parse_kegg_reactions",
+ "parse_taxonomy",
+ "read_kegg_table",
+ "run_hmmscan",
+ "stream_organism_gene_ko",
+ "write_kegg_tables",
+]
diff --git a/src/raven_python/reconstruction/kegg/assemble.py b/src/raven_python/reconstruction/kegg/assemble.py
new file mode 100644
index 0000000..a2b5eb9
--- /dev/null
+++ b/src/raven_python/reconstruction/kegg/assemble.py
@@ -0,0 +1,82 @@
+"""Shared assembly of a draft model from a KO→genes mapping.
+
+Both KEGG runtime paths end the same way: having decided which genes belong to
+which KO — from organism annotations (3b.4) or from HMM hits (3b.5) — they map
+KO→reaction against the gene-free reference model, OR-join the genes into each
+reaction's GPR, keep gene-backed reactions (plus spontaneous ones when allowed),
+and apply the ``keep*`` quality filters. That common tail lives here.
+"""
+from __future__ import annotations
+
+import cobra
+import pandas as pd
+
+_DOMAINS = {"eukaryotes", "prokaryotes"}
+
+
+def flag_set(rxn_flags: pd.DataFrame | None, column: str) -> set[str]:
+ """Reaction ids whose ``column`` flag is truthy (handles bool or TSV strings)."""
+ if rxn_flags is None or column not in rxn_flags:
+ return set()
+ mask = rxn_flags[column].map(lambda v: str(v).strip().lower() in ("true", "1"))
+ return set(rxn_flags.loc[mask, "reaction"])
+
+
+def assemble_model_from_ko_genes(
+ reference_model: cobra.Model,
+ ko_reaction: pd.DataFrame,
+ ko_to_genes: dict[str, list[str]],
+ *,
+ rxn_flags: pd.DataFrame | None = None,
+ keep_spontaneous: bool = True,
+ keep_undefined_stoich: bool = True,
+ keep_incomplete: bool = True,
+ keep_general: bool = False,
+ model_id: str | None = None,
+ model_name: str | None = None,
+ note: str | None = None,
+) -> tuple[cobra.Model, dict[str, list[str]]]:
+ """Build a draft model from a ``{ko: [gene, ...]}`` assignment.
+
+ Returns ``(model, gpr_map)`` where ``gpr_map`` is the kept reactions' gene
+ lists, so callers can add gene annotations afterwards.
+ """
+ rxn_to_kos: dict[str, set[str]] = {}
+ for ko, rid in zip(ko_reaction["ko"], ko_reaction["reaction"], strict=True):
+ rxn_to_kos.setdefault(rid, set()).add(ko)
+
+ spontaneous = flag_set(rxn_flags, "spontaneous")
+ drop_if = {
+ "undefined_stoich": (keep_undefined_stoich, flag_set(rxn_flags, "undefined_stoich")),
+ "incomplete": (keep_incomplete, flag_set(rxn_flags, "incomplete")),
+ "general": (keep_general, flag_set(rxn_flags, "general")),
+ }
+
+ gpr_map: dict[str, list[str]] = {}
+ spontaneous_kept: set[str] = set()
+ for rxn in reference_model.reactions:
+ rid = rxn.id
+ # Quality filters first: dropped even if it would have genes.
+ if any(not keep_flag and rid in flagged for keep_flag, flagged in drop_if.values()):
+ continue
+ genes = sorted({g for ko in rxn_to_kos.get(rid, ()) for g in ko_to_genes.get(ko, ())})
+ if genes:
+ gpr_map[rid] = genes
+ elif rid in spontaneous and keep_spontaneous:
+ spontaneous_kept.add(rid)
+
+ keep = set(gpr_map) | spontaneous_kept
+ model = reference_model.copy()
+ if model_id is not None:
+ model.id = model_id
+ if model_name is not None:
+ model.name = model_name
+ model.remove_reactions(
+ [r for r in model.reactions if r.id not in keep], remove_orphans=True
+ )
+ for rid, genes in gpr_map.items():
+ model.reactions.get_by_id(rid).gene_reaction_rule = " or ".join(genes)
+ if note is not None:
+ for rid in keep:
+ model.reactions.get_by_id(rid).notes["note"] = note
+ return model, gpr_map
diff --git a/src/raven_python/reconstruction/kegg/download.py b/src/raven_python/reconstruction/kegg/download.py
new file mode 100644
index 0000000..8bb1826
--- /dev/null
+++ b/src/raven_python/reconstruction/kegg/download.py
@@ -0,0 +1,257 @@
+"""Download and arrange a local KEGG flat-file dump (step 3b.1).
+
+Maintainer-side, build-time tooling. Ports ``fetch_keggdb.sh`` — fetch the KEGG
+FTP source archives, extract them, and lift/concatenate the files that the
+parser (3b.2) and HMM build (3b.3) consume — but as **pure Python stdlib**
+(``urllib`` + ``tarfile`` + ``gzip`` + ``netrc``). That drops the script's
+dependence on ``wget``/``tar``/``gunzip`` (and Cygwin on Windows), so it runs
+unchanged on Linux, macOS and Windows. Credential hygiene is kept: a paid KEGG
+subscription's username/password are read from ``~/.netrc`` (mode 600), never
+passed on the command line.
+
+Requires an active KEGG FTP subscription. Add to ``~/.netrc``::
+
+ machine ftp.kegg.net login YOUR_USER password YOUR_PASS
+
+Typical use (run once per KEGG release)::
+
+ from raven_python.reconstruction.kegg import download_kegg_dump, parse_kegg_dump
+ download_kegg_dump("keggdb") # -> keggdb/{reaction,compound,ko,...}
+ parse_kegg_dump("keggdb", "artefacts") # -> reference model + gzipped TSVs
+
+The arranged dump contains: ``reaction``, ``reaction.lst``,
+``reaction_mapformula.lst``, ``compound`` (compound + glycan concatenated),
+``compound.inchi``, ``ko``, ``genes.pep`` (eukaryote + prokaryote proteomes
+concatenated), and ``taxonomy``.
+"""
+from __future__ import annotations
+
+import gzip
+import netrc
+import shutil
+import tarfile
+import urllib.request
+from pathlib import Path
+
+KEGG_HOST = "ftp.kegg.net"
+BASE_URL = "https://ftp.kegg.net"
+
+# KEGG FTP paths fetched, mirroring fetch_keggdb.sh.
+DEFAULT_FILES: tuple[str, ...] = (
+ "kegg/ligand/reaction.tar.gz",
+ "kegg/ligand/compound.tar.gz",
+ "kegg/ligand/glycan.tar.gz",
+ "kegg/genes/ko.tar.gz",
+ "kegg/genes/fasta/eukaryotes.pep.gz",
+ "kegg/genes/fasta/prokaryotes.pep.gz",
+ "kegg/genes/misc/taxonomy",
+)
+
+
+# --------------------------------------------------------------------------- #
+# Credentials
+# --------------------------------------------------------------------------- #
+def _resolve_auth(
+ host: str,
+ *,
+ netrc_path: str | Path | None = None,
+ auth: tuple[str, str] | None = None,
+) -> tuple[str, str]:
+ """Return ``(user, password)`` for ``host`` from ``auth`` or a ``.netrc`` file."""
+ if auth is not None:
+ return auth
+ path = Path(netrc_path) if netrc_path else Path.home() / ".netrc"
+ if not path.is_file():
+ raise FileNotFoundError(
+ f"No credentials given and {path} does not exist. Create it (chmod 600) "
+ f"with a line:\n machine {host} login YOUR_USER password YOUR_PASS"
+ )
+ creds = netrc.netrc(str(path)).authenticators(host)
+ if not creds:
+ raise ValueError(
+ f"No credentials for '{host}' in {path}. Add a line:\n"
+ f" machine {host} login YOUR_USER password YOUR_PASS"
+ )
+ login, _, password = creds
+ if not login or not password:
+ raise ValueError(f"Incomplete credentials for '{host}' in {path}.")
+ return login, password
+
+
+def _build_opener(base_url: str, user: str, password: str) -> urllib.request.OpenerDirector:
+ mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
+ mgr.add_password(None, base_url, user, password)
+ return urllib.request.build_opener(
+ urllib.request.HTTPBasicAuthHandler(mgr),
+ urllib.request.HTTPDigestAuthHandler(mgr),
+ )
+
+
+# --------------------------------------------------------------------------- #
+# Fetch
+# --------------------------------------------------------------------------- #
+def fetch_kegg_files(
+ dest: str | Path,
+ *,
+ files: tuple[str, ...] = DEFAULT_FILES,
+ base_url: str = BASE_URL,
+ host: str = KEGG_HOST,
+ auth: tuple[str, str] | None = None,
+ netrc_path: str | Path | None = None,
+ force: bool = False,
+ verbose: bool = True,
+) -> list[Path]:
+ """Download the raw KEGG archives into ``dest`` (basenames). Returns the paths.
+
+ Existing files are skipped unless ``force=True`` (the script's ``wget -N``
+ intent, simplified to skip-if-present).
+ """
+ user, password = _resolve_auth(host, netrc_path=netrc_path, auth=auth)
+ opener = _build_opener(base_url, user, password)
+ dest = Path(dest)
+ dest.mkdir(parents=True, exist_ok=True)
+
+ out: list[Path] = []
+ for path in files:
+ target = dest / Path(path).name
+ if target.exists() and not force:
+ if verbose:
+ print(f" skip (exists): {target.name}")
+ out.append(target)
+ continue
+ url = f"{base_url.rstrip('/')}/{path.lstrip('/')}"
+ if verbose:
+ print(f" fetching {path}")
+ with opener.open(url) as resp, open(target, "wb") as handle:
+ shutil.copyfileobj(resp, handle)
+ out.append(target)
+ return out
+
+
+# --------------------------------------------------------------------------- #
+# Extract / arrange
+# --------------------------------------------------------------------------- #
+def _gunzip(src: Path, target: Path) -> None:
+ with gzip.open(src, "rb") as fh, open(target, "wb") as out:
+ shutil.copyfileobj(fh, out)
+
+
+def _concat(sources: list[Path], target: Path) -> None:
+ with open(target, "wb") as out:
+ for src in sources:
+ with open(src, "rb") as fh:
+ shutil.copyfileobj(fh, out)
+
+
+def extract_kegg_dump(dest: str | Path) -> dict[str, Path]:
+ """Extract and arrange the downloaded archives into the flat dump layout.
+
+ Mirrors ``fetch_keggdb.sh``'s extract step: untar the ``*.tar.gz`` archives,
+ gunzip the ``*.pep.gz`` proteomes, lift the needed files out of their
+ sub-directories, and concatenate compound+glycan and the two proteomes.
+ Tar extraction uses the ``data`` filter (no path traversal). Returns a
+ mapping of logical name -> path for the files produced.
+
+ Network-free, so this is the unit-tested core; ``download_kegg_dump`` chains
+ :func:`fetch_kegg_files` in front of it.
+ """
+ dest = Path(dest)
+
+ for tar_path in sorted(dest.glob("*.tar.gz")):
+ with tarfile.open(tar_path) as tar:
+ tar.extractall(dest, filter="data")
+ tar_path.unlink()
+
+ for gz_path in sorted(dest.glob("*.gz")): # only the .pep.gz remain
+ _gunzip(gz_path, gz_path.with_suffix(""))
+ gz_path.unlink()
+
+ def lift(rel: str, tmp: str) -> Path | None:
+ src = dest / rel
+ if src.is_file():
+ shutil.move(str(src), str(dest / tmp))
+ return dest / tmp
+ return None
+
+ reaction = lift("reaction/reaction", "_reaction")
+ lift("reaction/reaction.lst", "reaction.lst")
+ lift("reaction/reaction_mapformula.lst", "reaction_mapformula.lst")
+ compound = lift("compound/compound", "_compound")
+ lift("compound/compound.inchi", "compound.inchi")
+ glycan = lift("glycan/glycan", "_glycan")
+ ko = lift("ko/ko", "_ko")
+
+ for subdir in ("reaction", "compound", "glycan", "ko"):
+ path = dest / subdir
+ if path.is_dir():
+ shutil.rmtree(path)
+
+ missing = [n for n, p in (("reaction", reaction), ("compound", compound), ("ko", ko)) if p is None]
+ if missing:
+ raise FileNotFoundError(
+ f"KEGG archives did not yield required file(s): {missing}. "
+ f"Check that the source .tar.gz archives are present in {dest}."
+ )
+
+ shutil.move(str(reaction), str(dest / "reaction"))
+ shutil.move(str(ko), str(dest / "ko"))
+ if glycan is not None:
+ _concat([compound, glycan], dest / "compound")
+ compound.unlink()
+ glycan.unlink()
+ else:
+ shutil.move(str(compound), str(dest / "compound"))
+
+ peps = [p for p in (dest / "eukaryotes.pep", dest / "prokaryotes.pep") if p.is_file()]
+ if peps:
+ _concat(peps, dest / "genes.pep")
+ for pep in peps:
+ pep.unlink()
+
+ result: dict[str, Path] = {}
+ for name in (
+ "reaction",
+ "reaction.lst",
+ "reaction_mapformula.lst",
+ "compound",
+ "compound.inchi",
+ "ko",
+ "genes.pep",
+ "taxonomy",
+ ):
+ path = dest / name
+ if path.is_file():
+ result[name] = path
+ return result
+
+
+def download_kegg_dump(
+ dest: str | Path,
+ *,
+ files: tuple[str, ...] = DEFAULT_FILES,
+ base_url: str = BASE_URL,
+ host: str = KEGG_HOST,
+ auth: tuple[str, str] | None = None,
+ netrc_path: str | Path | None = None,
+ force: bool = False,
+ verbose: bool = True,
+) -> dict[str, Path]:
+ """Fetch and arrange a complete KEGG dump into ``dest``.
+
+ Convenience wrapper chaining :func:`fetch_kegg_files` and
+ :func:`extract_kegg_dump`. Returns the logical-name -> path mapping of the
+ arranged dump, ready for :func:`raven_python.reconstruction.kegg.parse_kegg_dump`.
+ """
+ fetch_kegg_files(
+ dest,
+ files=files,
+ base_url=base_url,
+ host=host,
+ auth=auth,
+ netrc_path=netrc_path,
+ force=force,
+ verbose=verbose,
+ )
+ if verbose:
+ print(">>> Extracting and arranging KEGG dump...")
+ return extract_kegg_dump(dest)
diff --git a/src/raven_python/reconstruction/kegg/hmm.py b/src/raven_python/reconstruction/kegg/hmm.py
new file mode 100644
index 0000000..0e210b6
--- /dev/null
+++ b/src/raven_python/reconstruction/kegg/hmm.py
@@ -0,0 +1,453 @@
+"""Build per-KO HMM libraries from KEGG sequences (step 3b.3, maintainer-side).
+
+Ports RAVEN ``constructMultiFasta`` plus the clustering/alignment/training stages
+of ``getKEGGModelForOrganism``. Run once per KEGG release to produce the
+``prok90`` / ``euk90`` HMM libraries that the de-novo query path (3b.5) searches.
+
+Per KO, within one domain (prokaryote / eukaryote):
+
+1. **Multi-FASTA** — gather the member genes' sequences from ``genes.pep``
+ (:func:`build_ko_fastas`).
+2. **CD-HIT** — dereplicate near-identical sequences (default 90 % identity).
+3. **MAFFT** — multiple-sequence alignment (``--auto --anysymbol``).
+4. **hmmbuild** — train the profile HMM.
+
+Finally the per-KO HMMs are concatenated and ``hmmpress``-ed into a single searchable
+library: a single ``hmmscan`` against the pressed database replaces a per-KO sweep with
+``hmmsearch``.
+
+The pure parts (FASTA indexing/grouping, command construction, CD-HIT ``-n``
+choice) are unit-tested; running the binaries needs HMMER/MAFFT/CD-HIT, located
+via :func:`raven_python.binaries.resolve_binary`.
+"""
+from __future__ import annotations
+
+import functools
+import logging
+import os
+import shutil
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+
+import pandas as pd
+
+from raven_python.binaries import resolve_binary
+from raven_python.reconstruction.kegg.taxonomy import organisms_in_domain
+
+logger = logging.getLogger(__name__)
+
+
+# --------------------------------------------------------------------------- #
+# Step 1 — per-KO multi-FASTA (constructMultiFasta)
+# --------------------------------------------------------------------------- #
+def _full_id(organism: str, gene: str) -> str:
+ """The genes.pep header key for a gene, i.e. ``organism:gene``."""
+ return f"{organism}:{gene}"
+
+
+def _index_fasta(path: str | Path, wanted: set[str]) -> dict[str, tuple[int, int]]:
+ """Map each wanted record id to its ``(start, end)`` byte span in ``path``.
+
+ The record id is the first whitespace-delimited token of the ``>`` header.
+ One streaming pass; only wanted ids are kept (memory stays small).
+ """
+ index: dict[str, tuple[int, int]] = {}
+ cur_id: str | None = None
+ cur_start = 0
+ pos = 0
+ with open(path, "rb") as handle:
+ for line in handle:
+ if line.startswith(b">"):
+ if cur_id is not None and cur_id in wanted:
+ index[cur_id] = (cur_start, pos)
+ cur_id = line[1:].split(None, 1)[0].decode()
+ cur_start = pos
+ pos += len(line)
+ if cur_id is not None and cur_id in wanted:
+ index[cur_id] = (cur_start, pos)
+ return index
+
+
+def build_ko_fastas(
+ organism_gene_ko: pd.DataFrame,
+ genes_pep: str | Path,
+ out_dir: str | Path,
+ *,
+ organisms: set[str] | None = None,
+) -> dict[str, Path]:
+ """Write one ``.fa`` per KO with its member genes' sequences.
+
+ but with a stdlib offset index instead
+ of the Java-hashtable byte scan. ``organisms`` restricts to a domain's
+ organism codes (for the prok/euk split). Empty KOs are skipped (no file).
+ Returns ``{ko: path}`` for the files written.
+ """
+ out_dir = Path(out_dir)
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ rows = organism_gene_ko
+ if organisms is not None:
+ rows = rows[rows["organism"].isin(organisms)]
+
+ ko_to_ids: dict[str, list[str]] = {}
+ wanted: set[str] = set()
+ for organism, gene, ko in zip(rows["organism"], rows["gene"], rows["ko"], strict=True):
+ fid = _full_id(organism, gene)
+ ko_to_ids.setdefault(ko, []).append(fid)
+ wanted.add(fid)
+
+ index = _index_fasta(genes_pep, wanted)
+
+ written: dict[str, Path] = {}
+ with open(genes_pep, "rb") as src:
+ for ko, ids in ko_to_ids.items():
+ present = sorted({i for i in ids if i in index})
+ if not present:
+ continue
+ path = out_dir / f"{ko}.fa"
+ with open(path, "wb") as out:
+ for fid in present:
+ start, end = index[fid]
+ src.seek(start)
+ out.write(src.read(end - start))
+ written[ko] = path
+ return written
+
+
+# --------------------------------------------------------------------------- #
+# Steps 2-4 — cluster, align, train (one KO)
+# --------------------------------------------------------------------------- #
+def _cdhit_word_size(seq_identity: float) -> str:
+ """CD-HIT ``-n`` word size for a given identity threshold (per CD-HIT guide)."""
+ if not 0.4 < seq_identity <= 1.0:
+ raise ValueError("seq_identity must be in (0.4, 1.0] (or -1 to skip CD-HIT).")
+ if seq_identity > 0.7:
+ return "5"
+ if seq_identity > 0.6:
+ return "4"
+ if seq_identity > 0.5:
+ return "3"
+ return "2"
+
+
+def _count_sequences(fasta: Path) -> int:
+ with open(fasta, "rb") as fh:
+ return sum(1 for line in fh if line.startswith(b">"))
+
+
+def _fasta_stats(fasta: Path) -> tuple[int, int]:
+ """Return ``(sequence_count, total_residues)`` in one pass."""
+ n = residues = 0
+ with open(fasta, "rb") as fh:
+ for line in fh:
+ if line.startswith(b">"):
+ n += 1
+ else:
+ residues += len(line.strip())
+ return n, residues
+
+
+def _cdhit_cmd(cdhit: str, inp: Path, out: Path, seq_identity: float, threads: int) -> list[str]:
+ return [
+ cdhit, "-i", str(inp), "-o", str(out),
+ "-c", str(seq_identity), "-n", _cdhit_word_size(seq_identity),
+ "-M", "2000", "-T", str(threads),
+ ]
+
+
+# MAFFT uses fast progressive FFT-NS-2 until an alignment is large enough to
+# threaten memory, then switches to memory-light PartTree (which keeps all
+# sequences; only the guide tree is approximated).
+#
+# Peak FFT-NS-2 RSS is driven by the progressive-alignment DP work, ~ n_seqs ×
+# (mean length)^2 (equivalently residues^2 / n_seqs) — NOT residue count alone:
+# a few hundred long proteins cost far more than the same residues spread over
+# many short ones. Empirical fit (real KEGG sequences, 12 threads):
+# RSS_GB ≈ _MAFFT_GB_PER_COST × (n_seqs × mean_len^2)
+# Measured (residues, n_seqs, RSS): 250k/266/0.67, 500k/534/1.25, 1.0M/1066/3.16,
+# 1.5M/1624/5.73, and K12047 941k/452 (mean len 2082) which OOM'd >7 GB — its
+# cost 1.96e9 is the largest of all, hence the length-aware metric.
+_MAFFT_GB_PER_COST = 4.2e-9 # GB per unit of (n_seqs × mean_len^2); conservative upper bound
+_MAFFT_MEMORY_OVERHEAD_GB = 2.5 # RAM not for MAFFT (OS + WSL2 + Python); WSL total overcounts
+_MEMORY_SAFETY = 0.65 # leave headroom; never budget MAFFT to the brink
+_DEFAULT_COST_BUDGET = 5e8 # fallback DP-cost budget when total memory can't be detected
+_LOW_MEMORY_BYTES = 16 * 1024**3 # below this, warn that the budget is conservative
+
+
+def _total_memory_bytes() -> int | None:
+ try:
+ return os.sysconf("SC_PHYS_PAGES") * os.sysconf("SC_PAGE_SIZE")
+ except (AttributeError, ValueError, OSError):
+ return None
+
+
+def _alignment_cost(n_seqs: int, residues: int) -> float:
+ """FFT-NS-2 memory proxy: ``n_seqs × mean_len^2`` = ``residues^2 / n_seqs``."""
+ return residues * residues / n_seqs if n_seqs else 0.0
+
+
+@functools.lru_cache(maxsize=1)
+def _auto_cost_budget() -> float:
+ """Max FFT-NS-2 DP-cost (``n_seqs × mean_len^2``) before switching to PartTree.
+
+ Derived from available RAM via the measured memory model; above it, an
+ alignment is predicted to exceed a safe fraction of the RAM left for MAFFT.
+ Computed and logged once; warns on low-memory hosts (more KOs then use the
+ approximate PartTree).
+ """
+ total = _total_memory_bytes()
+ if total is None:
+ logger.warning(
+ "Could not detect system memory; using default MAFFT cost budget %.2e. "
+ "Pass parttree_residues to override.", _DEFAULT_COST_BUDGET,
+ )
+ return _DEFAULT_COST_BUDGET
+ total_gb = total / 1024**3
+ mafft_gb = max(total_gb - _MAFFT_MEMORY_OVERHEAD_GB, 0.5)
+ budget = _MEMORY_SAFETY * mafft_gb / _MAFFT_GB_PER_COST
+ logger.info(
+ "MAFFT DP-cost budget %.2e auto-set from %.1f GB RAM (~%.1f GB for MAFFT)",
+ budget, total_gb, mafft_gb,
+ )
+ if total < _LOW_MEMORY_BYTES:
+ logger.warning(
+ "Limited memory (%.1f GB total): MAFFT cost budget set conservatively to "
+ "%.2e, so more (especially long-protein) KOs use the approximate PartTree "
+ "alignment. With more RAM, fewer would.", total_gb, budget,
+ )
+ return budget
+
+
+def _mafft_cmd(
+ mafft: str, inp: Path, threads: int, *, fast: bool = True, parttree: bool = False
+) -> list[str]:
+ """Build the MAFFT command.
+
+ ``fast`` selects FFT-NS-2 (``--retree 2 --maxiterate 0``) — fast progressive
+ alignment, the right trade-off for building profile HMMs — instead of
+ ``--auto`` (which picks slow iterative refinement on medium/large inputs).
+ ``parttree`` adds MAFFT's PartTree approximation for very large inputs.
+ """
+ cmd = [mafft]
+ if parttree:
+ cmd += ["--retree", "2", "--parttree"]
+ elif fast:
+ cmd += ["--retree", "2", "--maxiterate", "0"]
+ else:
+ cmd += ["--auto"]
+ cmd += ["--anysymbol", "--thread", str(threads), str(inp)]
+ return cmd
+
+
+def _hmmbuild_cmd(
+ hmmbuild: str, out_hmm: Path, aligned: Path, threads: int, name: str | None = None
+) -> list[str]:
+ cmd = [hmmbuild, "--cpu", str(threads)]
+ if name: # name the profile after its KO so hmmscan targets are KO ids
+ cmd += ["-n", name]
+ cmd += [str(out_hmm), str(aligned)]
+ return cmd
+
+
+def _run(cmd: list[str], *, stdout_path: Path | None = None) -> str:
+ """Run a command; optionally redirect stdout to a file. Raises on failure."""
+ if stdout_path is not None:
+ with open(stdout_path, "w") as out:
+ proc = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
+ stderr = proc.stderr or ""
+ else:
+ proc = subprocess.run(cmd, capture_output=True, text=True)
+ stderr = proc.stderr or ""
+ if proc.returncode != 0:
+ raise RuntimeError(f"{Path(cmd[0]).name} failed:\n{stderr.strip()}")
+ return stderr
+
+
+def _staged_run(
+ cmd: list[str], *, label: str, stage: str, verbose: bool,
+ stdout_path: Path | None = None, log: bool = True,
+) -> float:
+ """Run a stage's command; log one completion line per stage (when verbose).
+
+ At INFO (when ``log``): a single ``[KO] stage: done in X.Xs`` line — the
+ ``stage`` descriptor already names the tool/mode and any seq/res/cost context,
+ so the timing is just appended rather than repeated on a second line. The
+ tool's own stderr (MAFFT/CD-HIT/hmmbuild progress) is logged at DEBUG. Pass
+ ``log=False`` to suppress the line so the caller can fold the timing into its
+ own message. Returns the stage's wall-clock seconds.
+ """
+ start = time.perf_counter()
+ stderr = _run(cmd, stdout_path=stdout_path)
+ elapsed = time.perf_counter() - start
+ if verbose:
+ if log:
+ logger.info("[%s] %s: done in %.1fs", label, stage, elapsed)
+ if stderr.strip():
+ logger.debug("[%s] %s output:\n%s", label, stage, stderr.strip())
+ return elapsed
+
+
+def build_ko_hmm(
+ ko_fasta: str | Path,
+ out_hmm: str | Path,
+ *,
+ seq_identity: float = 0.9,
+ parttree_residues: int | None = None,
+ threads: int = 1,
+ fast: bool = True,
+ verbose: bool = False,
+ cdhit: str | Path | None = None,
+ mafft: str | Path | None = None,
+ hmmbuild: str | Path | None = None,
+) -> Path:
+ """Cluster, align and train a profile HMM for one KO's multi-FASTA.
+
+ Single-sequence KOs skip CD-HIT/MAFFT (a lone sequence is its own alignment).
+ ``seq_identity=-1`` skips CD-HIT. All (deduplicated) sequences are kept —
+ memory on large KOs is bounded by switching MAFFT to PartTree, not by
+ dropping sequences. ``fast`` uses MAFFT FFT-NS-2 (fast progressive) rather
+ than ``--auto``'s slow iterative refinement. MAFFT switches to memory-light
+ PartTree once an alignment is predicted to be too memory-heavy: by default from
+ its **DP cost** (``n_seqs × mean_len²`` — long proteins cost far more than the
+ same residue count in short ones) against a RAM-derived budget
+ (:func:`_auto_cost_budget`). Passing ``parttree_residues`` overrides this with a
+ simple residue-count cutoff.
+ ``verbose`` logs (via the ``logging`` module, INFO/DEBUG) which tool is running
+ for this KO, sequence counts at each stage, timings, and the tools' own
+ output. Returns ``out_hmm``.
+ """
+ ko_fasta = Path(ko_fasta)
+ out_hmm = Path(out_hmm)
+ label = out_hmm.stem
+ out_hmm.parent.mkdir(parents=True, exist_ok=True)
+ n = _count_sequences(ko_fasta)
+ if n == 0:
+ raise ValueError(f"{ko_fasta} contains no sequences.")
+ if verbose:
+ logger.info("[%s] start: %d sequences", label, n)
+
+ hmmbuild = resolve_binary("hmmbuild", binary=hmmbuild)
+ with tempfile.TemporaryDirectory() as tmp:
+ tmp = Path(tmp)
+ if n == 1:
+ if verbose:
+ logger.info("[%s] single sequence: skipping CD-HIT/MAFFT", label)
+ aligned = ko_fasta # trivially aligned
+ else:
+ clustered = ko_fasta
+ cdhit_elapsed: float | None = None
+ if seq_identity != -1:
+ clustered = tmp / "clustered.fa"
+ cdhit_elapsed = _staged_run(
+ _cdhit_cmd(
+ resolve_binary("cd-hit", binary=cdhit), ko_fasta, clustered,
+ seq_identity, threads,
+ ),
+ label=label, stage=f"CD-HIT ({seq_identity})", verbose=verbose, log=False,
+ )
+ n_clustered, residues = _fasta_stats(clustered)
+ if verbose and cdhit_elapsed is not None:
+ logger.info(
+ "[%s] CD-HIT (%s): %d -> %d sequences in %.1fs",
+ label, seq_identity, n, n_clustered, cdhit_elapsed,
+ )
+ aligned = tmp / "aligned.fa"
+ if n_clustered == 1:
+ if verbose:
+ logger.info("[%s] one sequence after CD-HIT: skipping MAFFT", label)
+ shutil.copyfile(clustered, aligned) # MAFFT can't align a single seq
+ else:
+ # PartTree once the alignment is too memory-heavy. Default: its DP
+ # cost (n_seqs × mean_len^2) vs a RAM-derived budget — length-aware,
+ # so long-protein KOs (few seqs, huge residues) route correctly.
+ # parttree_residues, if given, overrides with a residue-count cutoff.
+ cost = _alignment_cost(n_clustered, residues)
+ if parttree_residues is None:
+ parttree = cost > _auto_cost_budget()
+ else:
+ parttree = residues > parttree_residues
+ _staged_run(
+ _mafft_cmd(
+ resolve_binary("mafft", binary=mafft), clustered, threads,
+ fast=fast, parttree=parttree,
+ ),
+ label=label,
+ stage=f"MAFFT {'PartTree' if parttree else 'FFT-NS-2' if fast else 'auto'} "
+ f"({n_clustered} seqs, {residues} res, cost {cost:.2e})",
+ verbose=verbose,
+ stdout_path=aligned,
+ )
+ _staged_run(
+ _hmmbuild_cmd(hmmbuild, out_hmm, aligned, threads, name=label),
+ label=label, stage="hmmbuild", verbose=verbose,
+ )
+ if verbose:
+ logger.info("[%s] complete -> %s", label, out_hmm)
+ return out_hmm
+
+
+# --------------------------------------------------------------------------- #
+# Orchestration — a full domain library
+# --------------------------------------------------------------------------- #
+def build_hmm_library(
+ organism_gene_ko: pd.DataFrame,
+ genes_pep: str | Path,
+ taxonomy: str | Path,
+ out_dir: str | Path,
+ *,
+ domain: str,
+ seq_identity: float = 0.9,
+ parttree_residues: int | None = None,
+ threads: int = 1,
+ fast: bool = True,
+ verbose: bool = False,
+ press: bool = True,
+ cdhit: str | Path | None = None,
+ mafft: str | Path | None = None,
+ hmmbuild: str | Path | None = None,
+ hmmpress: str | Path | None = None,
+) -> dict[str, Path | list[Path]]:
+ """Build a domain (``"prokaryotes"``/``"eukaryotes"``) HMM library.
+
+ Restricts genes to the domain's organisms (from ``taxonomy``), builds a
+ multi-FASTA and a profile HMM per KO under ``out_dir``, and (if ``press``)
+ concatenates them into ``out_dir/library.hmm`` and ``hmmpress``-es it for fast
+ ``hmmscan`` querying. Returns ``{"hmms": [...], "library": path | None}``.
+
+ Heavy and binary-dependent — intended for the maintainer, run once per KEGG
+ release. Skips KOs that already have an ``.hmm`` (resumable).
+ """
+ out_dir = Path(out_dir)
+ fasta_dir = out_dir / "fasta"
+ hmm_dir = out_dir / "hmms"
+ hmm_dir.mkdir(parents=True, exist_ok=True)
+
+ organisms = organisms_in_domain(taxonomy, domain)
+ if not organisms:
+ raise ValueError(f"No organisms found for domain {domain!r} in {taxonomy}.")
+
+ ko_fastas = build_ko_fastas(organism_gene_ko, genes_pep, fasta_dir, organisms=organisms)
+
+ hmms: list[Path] = []
+ for ko, fasta in ko_fastas.items():
+ out_hmm = hmm_dir / f"{ko}.hmm"
+ if not out_hmm.exists():
+ build_ko_hmm(
+ fasta, out_hmm, seq_identity=seq_identity,
+ parttree_residues=parttree_residues, threads=threads, fast=fast,
+ verbose=verbose, cdhit=cdhit, mafft=mafft, hmmbuild=hmmbuild,
+ )
+ hmms.append(out_hmm)
+
+ library: Path | None = None
+ if press and hmms:
+ library = out_dir / "library.hmm"
+ with open(library, "wb") as out:
+ for hmm in sorted(hmms):
+ with open(hmm, "rb") as fh:
+ shutil.copyfileobj(fh, out)
+ _run([resolve_binary("hmmpress", binary=hmmpress), "-f", str(library)])
+
+ return {"hmms": hmms, "library": library}
diff --git a/src/raven_python/reconstruction/kegg/organism.py b/src/raven_python/reconstruction/kegg/organism.py
new file mode 100644
index 0000000..9f30575
--- /dev/null
+++ b/src/raven_python/reconstruction/kegg/organism.py
@@ -0,0 +1,153 @@
+"""Build a draft model for a KEGG species from the reference artefacts (step 3b.4).
+
+Ports the **organism-ID** path of RAVEN ``getKEGGModelForOrganism`` (the branch
+taken when no FASTA file is given). For an organism already annotated in KEGG it
+needs no homology search: take the organism's gene↔KO assignments, map KO→reaction
+against the gene-free reference model, OR-join the organism's genes into each
+reaction's GPR, and keep the reactions that end up with genes (plus spontaneous
+reactions, optionally). The HMM/FASTA path is step 3b.5 (:mod:`.query`).
+
+Consumes the 3b.2 artefacts: the gene-free reference ``cobra.Model`` plus the
+``ko_reaction``, ``organism_gene_ko`` and ``rxn_flags`` tables. The KO→reaction
+mapping is taken from the ``ko_reaction`` table (a lossless published artefact)
+rather than from the reference model's annotations, so it does not depend on KEGG
+annotations surviving an SBML round-trip.
+
+Domain mode (``organism_id`` = ``"eukaryotes"``/``"prokaryotes"``) keeps the genes
+of every organism in that domain; it needs the KEGG ``taxonomy`` file. Unlike
+RAVEN, this uses the domain classification directly rather than the full
+``getPhylDist`` distance matrix — the matrix existed for per-organism HMM
+subsampling, which our fixed prok90/euk90 libraries (3b.3) make unnecessary.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import cobra
+import pandas as pd
+
+from raven_python.io.yaml import read_yaml_model
+from raven_python.reconstruction.kegg.assemble import _DOMAINS, assemble_model_from_ko_genes
+from raven_python.reconstruction.kegg.parse import read_kegg_table
+from raven_python.reconstruction.kegg.taxonomy import organisms_in_domain
+
+_NOTE = "Included by get_kegg_model_for_organism (no HMMs)"
+
+
+def get_kegg_model_for_organism(
+ organism_id: str,
+ reference_model: cobra.Model,
+ ko_reaction: pd.DataFrame,
+ organism_gene_ko: pd.DataFrame,
+ *,
+ rxn_flags: pd.DataFrame | None = None,
+ taxonomy: str | Path | None = None,
+ keep_spontaneous: bool = True,
+ keep_undefined_stoich: bool = True,
+ keep_incomplete: bool = True,
+ keep_general: bool = False,
+) -> cobra.Model:
+ """Reconstruct a draft model for a KEGG species from its KO annotations.
+
+ Parameters
+ ----------
+ organism_id
+ Three/four-letter KEGG organism code (e.g. ``"eco"``), or
+ ``"eukaryotes"``/``"prokaryotes"`` for a whole-domain model (requires
+ ``taxonomy``). Matched case-insensitively.
+ reference_model
+ The gene-free KEGG reference model (from :func:`build_reference_model`).
+ ko_reaction, organism_gene_ko, rxn_flags
+ The relational tables from :func:`build_kegg_tables` (or read back with
+ :func:`read_kegg_table`).
+ taxonomy
+ Path to the KEGG ``taxonomy`` file; required only for domain mode.
+ keep_spontaneous, keep_undefined_stoich, keep_incomplete, keep_general
+ Quality filters (RAVEN's ``keep*``). A reaction flagged in ``rxn_flags``
+ is dropped unless its keep flag is set; this takes precedence over having
+ genes. Spontaneous reactions are additionally kept *without* genes when
+ ``keep_spontaneous`` is true.
+
+ Returns
+ -------
+ cobra.Model
+ A copy of the reference restricted to the organism's reactions, with GPRs
+ built and ``kegg.genes`` annotations on the genes.
+ """
+ org = organism_id.lower()
+ if org in _DOMAINS:
+ if taxonomy is None:
+ raise ValueError(
+ f"Domain mode ({organism_id!r}) needs the KEGG taxonomy file; "
+ "pass taxonomy=."
+ )
+ members = organisms_in_domain(taxonomy, org)
+ rows = organism_gene_ko[organism_gene_ko["organism"].str.lower().isin(members)]
+ else:
+ known = set(organism_gene_ko["organism"].str.lower())
+ if org not in known:
+ raise ValueError(
+ f"Organism '{organism_id}' has no genes in organism_gene_ko. "
+ f"Provide a KEGG species code present in the table."
+ )
+ rows = organism_gene_ko[organism_gene_ko["organism"].str.lower() == org]
+
+ ko_to_genes: dict[str, list[str]] = {}
+ for org_code, gene, ko in zip(rows["organism"], rows["gene"], rows["ko"], strict=True):
+ # In domain mode genes from different organisms can share a bare id;
+ # qualify with the organism so they stay distinct.
+ gene_id = gene if org not in _DOMAINS else f"{org_code.lower()}:{gene}"
+ ko_to_genes.setdefault(ko, []).append(gene_id)
+
+ model, _ = assemble_model_from_ko_genes(
+ reference_model,
+ ko_reaction,
+ ko_to_genes,
+ rxn_flags=rxn_flags,
+ keep_spontaneous=keep_spontaneous,
+ keep_undefined_stoich=keep_undefined_stoich,
+ keep_incomplete=keep_incomplete,
+ keep_general=keep_general,
+ model_id=organism_id,
+ model_name=f"Generated by get_kegg_model_for_organism for {organism_id}",
+ note=_NOTE,
+ )
+ for gene in model.genes:
+ # Species mode: bare gene id -> organism:gene. Domain mode: already
+ # organism-qualified.
+ value = gene.id if ":" in gene.id else f"{org}:{gene.id}"
+ gene.annotation["kegg.genes"] = value
+ return model
+
+
+def get_kegg_model_for_organism_from_artefacts(
+ organism_id: str,
+ artefact_dir: str | Path | None = None,
+ *,
+ version: str | None = None,
+ **kwargs,
+) -> cobra.Model:
+ """Load the published 3b.2 artefacts from ``artefact_dir`` and build the model.
+
+ Reads ``reference_model.yml.gz`` and the ``ko_reaction``/``organism_gene_ko``/
+ ``rxn_flags`` gzipped-TSV tables, then calls :func:`get_kegg_model_for_organism`.
+ If ``artefact_dir`` is ``None`` the published artefacts are fetched/cached via
+ :func:`raven_python.data.ensure_kegg_data` (``version`` selects the release).
+ """
+ if artefact_dir is None:
+ from raven_python.data import ensure_kegg_data
+
+ artefact_dir = ensure_kegg_data(version=version)
+ artefact_dir = Path(artefact_dir)
+ reference_model = read_yaml_model(artefact_dir / "reference_model.yml.gz")
+ ko_reaction = read_kegg_table(artefact_dir / "ko_reaction.tsv.gz")
+ organism_gene_ko = read_kegg_table(artefact_dir / "organism_gene_ko.tsv.xz")
+ rxn_flags = read_kegg_table(artefact_dir / "rxn_flags.tsv.gz")
+ return get_kegg_model_for_organism(
+ organism_id,
+ reference_model,
+ ko_reaction,
+ organism_gene_ko,
+ rxn_flags=rxn_flags,
+ **kwargs,
+ )
diff --git a/src/raven_python/reconstruction/kegg/parse.py b/src/raven_python/reconstruction/kegg/parse.py
new file mode 100644
index 0000000..3ecd6f4
--- /dev/null
+++ b/src/raven_python/reconstruction/kegg/parse.py
@@ -0,0 +1,578 @@
+"""Parse a local KEGG flat-file dump into a reference model + relational tables.
+
+Maintainer-side, build-time tooling. Produces the published raven_python KEGG artefacts:
+
+* a **gene-free reference GEM** (reactions + metabolites only) as a ``cobra.Model``;
+* minimal **relational tables** (``pandas.DataFrame``) written as gzipped TSV —
+ ``ko_reaction``, ``ko_names``, ``organism_gene_ko`` (the large one), and
+ ``rxn_flags`` (spontaneous / undefined-stoich / incomplete / general).
+
+Genes live only in ``organism_gene_ko``; per-organism GPRs are built at runtime
+(3b.4/3b.5), so the reference model stays small.
+
+Improvements over the RAVEN port (logged in IMPROVEMENTS.md):
+
+* **K1** — equations are read from each reaction entry's own ``EQUATION`` field,
+ dropping RAVEN's fragile dependence on ``reaction.lst`` being in the exact same
+ line order as ``reaction``.
+* **K2** — undefined-stoichiometry terms (``n C00001``, ``(n+1) C00002``) keep
+ their real compound id with coefficient 1 and the reaction is *flagged*, rather
+ than minting ``"n C00001"`` pseudo-metabolites and renaming them ``undefined_N``.
+* **K3** — quality labels become a tidy boolean ``rxn_flags`` table instead of
+ free-text appended to ``rxnNotes``.
+
+The KEGG flat-file format: each entry is a block of lines terminated by ``///``;
+a field label occupies columns 1-12, continuation lines are indented 12 spaces.
+"""
+from __future__ import annotations
+
+import gzip
+import heapq
+import lzma
+import re
+import tempfile
+from collections.abc import Iterator
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import cobra
+import pandas as pd
+
+from raven_python.io.yaml import write_yaml_model
+
+# A KEGG entry id is the first token after the 12-char ENTRY label (6 chars:
+# R00010, C00001, K01194, ...).
+_ID_LEN = 6
+_LABEL_WIDTH = 12
+
+# Compound token inside an equation, optionally a glycan (G) or drug (D); we also
+# tolerate trailing polymer suffixes like "C00404(n)" by matching the stem.
+_MET_TOKEN = re.compile(r"^([CGD]\d{5})")
+_NUMERIC = re.compile(r"^\d+(\.\d+)?$")
+
+
+# --------------------------------------------------------------------------- #
+# Generic flat-file reader
+# --------------------------------------------------------------------------- #
+def _iter_entries(path: str | Path) -> Iterator[dict[str, list[str]]]:
+ """Yield one ``{field_label: [value_lines]}`` dict per ``///``-delimited entry.
+
+ Field labels (columns 1-12) key a list of their value lines in file order;
+ continuation lines (12 leading spaces) append to the current field.
+ """
+ entry: dict[str, list[str]] = {}
+ current: str | None = None
+ with open(path, encoding="utf-8") as handle:
+ for raw in handle:
+ line = raw.rstrip("\n")
+ if line.startswith("///"):
+ if entry:
+ yield entry
+ entry, current = {}, None
+ continue
+ if not line.strip():
+ continue
+ label = line[:_LABEL_WIDTH].strip()
+ value = line[_LABEL_WIDTH:].rstrip()
+ if label:
+ current = label
+ entry.setdefault(current, []).append(value)
+ elif current is not None:
+ entry[current].append(value)
+ if entry: # tolerate a missing final '///'
+ yield entry
+
+
+# --------------------------------------------------------------------------- #
+# Reactions
+# --------------------------------------------------------------------------- #
+@dataclass
+class KeggReaction:
+ """A reaction parsed from the KEGG ``reaction`` flat file."""
+
+ id: str
+ name: str = ""
+ equation: str = ""
+ reversible: bool = True
+ eccodes: list[str] = field(default_factory=list)
+ kos: list[str] = field(default_factory=list)
+ pathways: list[str] = field(default_factory=list)
+ spontaneous: bool = False
+ incomplete: bool = False
+ general: bool = False
+ undefined_stoich: bool = False
+ # Cached stoichiometry from ``_parse_equation(equation)``: populated by
+ # :func:`parse_kegg_reactions` so :func:`build_reference_model` reuses the
+ # parse instead of repeating it (KEGG has ~12k reactions; a full redundant
+ # parse cost a noticeable chunk of the build).
+ stoichiometry: dict[str, float] = field(default_factory=dict)
+
+
+def _first_id(lines: list[str]) -> str:
+ return lines[0][:_ID_LEN].strip() if lines else ""
+
+
+def _comment_flags(rxn: KeggReaction, comment: str) -> None:
+ text = comment.upper()
+ rxn.spontaneous = "SPONTANEOUS" in text
+ rxn.incomplete = any(w in text for w in ("INCOMPLETE", "ERRONEOUS", "UNCLEAR"))
+ rxn.general = "GENERAL REACTION" in text
+
+
+def _parse_equation(equation: str) -> tuple[dict[str, float], bool, bool]:
+ """Parse a KEGG equation into ``({met_id: coef}, reversible, undefined_stoich)``.
+
+ Reactants get negative coefficients, products positive. Non-numeric
+ coefficients (``n``, ``(n+1)``, ``2n``) are treated as 1.0 and flag the
+ reaction as having undefined stoichiometry (improvement K2).
+ """
+ reversible = "<=>" in equation
+ parts = re.split(r"\s(?:<=>|=>|<=)\s", equation, maxsplit=1)
+ lhs, rhs = (parts + ["", ""])[:2]
+
+ stoich: dict[str, float] = {}
+ undefined = False
+ for side, sign in ((lhs, -1.0), (rhs, 1.0)):
+ for term in filter(None, (t.strip() for t in side.split(" + "))):
+ tokens = term.split()
+ met_token = tokens[-1]
+ coef_tokens = tokens[:-1]
+ if coef_tokens and _NUMERIC.match(coef_tokens[0]):
+ coef = float(coef_tokens[0])
+ else:
+ coef = 1.0
+ if coef_tokens: # a symbolic coefficient like 'n' or '(n+1)'
+ undefined = True
+ match = _MET_TOKEN.match(met_token)
+ if not match: # unparseable term -> flag, keep raw token
+ undefined = True
+ met_id = met_token
+ else:
+ met_id = match.group(1)
+ stoich[met_id] = stoich.get(met_id, 0.0) + sign * coef
+ # Drop metabolites that cancel out (A <=> A + B leaves A at 0).
+ stoich = {m: c for m, c in stoich.items() if c != 0.0}
+ return stoich, reversible, undefined
+
+
+def parse_kegg_reactions(kegg_dir: str | Path) -> list[KeggReaction]:
+ """Parse ``/reaction`` into :class:`KeggReaction` records.
+
+ Reversibility is taken from the equation arrow and, when
+ ``reaction_mapformula.lst`` is present, refined to mark reactions that are
+ irreversible across all KEGG maps (see :func:`_irreversible_from_mapformula`).
+ """
+ kegg_dir = Path(kegg_dir)
+ reactions: list[KeggReaction] = []
+ for entry in _iter_entries(kegg_dir / "reaction"):
+ rxn = KeggReaction(id=_first_id(entry.get("ENTRY", [])))
+ if not rxn.id:
+ continue
+ if entry.get("NAME"):
+ rxn.name = entry["NAME"][0].rstrip(";").strip()
+ if entry.get("COMMENT"):
+ _comment_flags(rxn, " ".join(entry["COMMENT"]))
+ if entry.get("ENZYME"):
+ rxn.eccodes = [ec for line in entry["ENZYME"] for ec in line.split()]
+ rxn.kos = [line[:_ID_LEN].strip() for line in entry.get("ORTHOLOGY", [])]
+ for line in entry.get("PATHWAY", []):
+ pid = line[:7].strip()
+ if pid and not pid.startswith(("rn011", "rn012")): # skip global/overview
+ rxn.pathways.append(pid)
+ if entry.get("EQUATION"):
+ rxn.equation = " ".join(s.strip() for s in entry["EQUATION"])
+ stoich, rxn.reversible, rxn.undefined_stoich = _parse_equation(rxn.equation)
+ rxn.stoichiometry = stoich # cached for build_reference_model
+ reactions.append(rxn)
+
+ irrev = _irreversible_from_mapformula(kegg_dir / "reaction_mapformula.lst")
+ for rxn in reactions:
+ if rxn.id in irrev:
+ rxn.reversible = False
+ return reactions
+
+
+def _irreversible_from_mapformula(path: str | Path) -> set[str]:
+ """Reaction ids that are irreversible in *every* KEGG map they appear in.
+
+ ``reaction_mapformula.lst`` lines look like ``R00005: 00330: C01010 => C00011``.
+ A reaction is considered irreversible only if no map lists it as ``<=>`` and
+ every map draws it in the same direction. Direction (substrate/product order)
+ is not propagated back into the model stoichiometry — a documented
+ simplification of RAVEN's column-flipping logic, which only affects the small
+ set of map-directional reactions.
+ """
+ path = Path(path)
+ if not path.is_file():
+ return set()
+ seen_reversible: set[str] = set()
+ products: dict[str, str] = {}
+ conflicting: set[str] = set()
+ for entry in _iter_mapformula_lines(path):
+ rid, reversible, product = entry
+ if reversible:
+ seen_reversible.add(rid)
+ elif rid in products and products[rid] != product:
+ conflicting.add(rid) # drawn both directions across maps -> reversible
+ else:
+ products.setdefault(rid, product)
+ return {rid for rid in products if rid not in seen_reversible and rid not in conflicting}
+
+
+def _iter_mapformula_lines(path: Path) -> Iterator[tuple[str, bool, str]]:
+ with open(path, encoding="utf-8") as handle:
+ for raw in handle:
+ line = raw.strip()
+ if not line or ":" not in line:
+ continue
+ rid = line[:_ID_LEN]
+ reversible = "<=>" in line
+ product = line.split()[-1]
+ yield rid, reversible, product
+
+
+# --------------------------------------------------------------------------- #
+# Compounds
+# --------------------------------------------------------------------------- #
+@dataclass
+class KeggCompound:
+ """A metabolite parsed from the KEGG ``compound`` flat file."""
+
+ id: str
+ name: str = ""
+ formula: str = ""
+ inchi: str = ""
+ chebi: list[str] = field(default_factory=list)
+ pubchem: list[str] = field(default_factory=list)
+
+
+def parse_kegg_compounds(kegg_dir: str | Path) -> list[KeggCompound]:
+ """Parse ``/compound`` (+ optional ``compound.inchi``) into records."""
+ kegg_dir = Path(kegg_dir)
+ compounds: list[KeggCompound] = []
+ for entry in _iter_entries(kegg_dir / "compound"):
+ cid = _first_id(entry.get("ENTRY", []))
+ if not cid:
+ continue
+ cmp = KeggCompound(id=cid)
+ if entry.get("NAME"):
+ # Only the first synonym; KEGG separates them with ';'.
+ cmp.name = entry["NAME"][0].split(";")[0].strip()
+ if entry.get("FORMULA"):
+ cmp.formula = entry["FORMULA"][0].strip()
+ for line in entry.get("DBLINKS", []):
+ if line.startswith("ChEBI:"):
+ cmp.chebi += [f"CHEBI:{x}" for x in line.split(":", 1)[1].split()]
+ elif line.startswith("PubChem:"):
+ cmp.pubchem += line.split(":", 1)[1].split()
+ compounds.append(cmp)
+
+ inchis = _parse_inchis(kegg_dir / "compound.inchi")
+ for cmp in compounds:
+ if cmp.id in inchis:
+ cmp.inchi = inchis[cmp.id]
+ cmp.formula = "" # prefer the InChI; matches RAVEN
+ return compounds
+
+
+def _parse_inchis(path: str | Path) -> dict[str, str]:
+ path = Path(path)
+ if not path.is_file():
+ return {}
+ out: dict[str, str] = {}
+ with open(path, encoding="utf-8") as handle:
+ for raw in handle:
+ cid, _, inchi = raw.rstrip("\n").partition("\t")
+ if cid and inchi:
+ out[cid.strip()] = inchi.strip()
+ return out
+
+
+# --------------------------------------------------------------------------- #
+# KOs and organism genes
+# --------------------------------------------------------------------------- #
+@dataclass
+class KeggKO:
+ """A KEGG Orthology entry: its name and the organism genes assigned to it."""
+
+ id: str
+ name: str = ""
+ genes: list[tuple[str, str]] = field(default_factory=list) # (organism, gene)
+
+
+def parse_kegg_kos(kegg_dir: str | Path, *, keep: set[str] | None = None) -> list[KeggKO]:
+ """Parse ``/ko`` into :class:`KeggKO` records (name + organism genes).
+
+ ``keep`` limits parsing to those KO ids (e.g. only KOs linked to reactions),
+ mirroring RAVEN's ``koList`` argument — the gene lists are huge, so this is
+ the usual call.
+ """
+ ko_records: list[KeggKO] = []
+ for entry in _iter_entries(Path(kegg_dir) / "ko"):
+ ko_id = _first_id(entry.get("ENTRY", []))
+ if not ko_id or (keep is not None and ko_id not in keep):
+ continue
+ ko = KeggKO(id=ko_id)
+ if entry.get("DEFINITION"):
+ ko.name = entry["DEFINITION"][0].strip()
+ ko.genes = list(_parse_gene_lines(entry.get("GENES", [])))
+ ko_records.append(ko)
+ return ko_records
+
+
+def _parse_gene_lines(lines: list[str]) -> Iterator[tuple[str, str]]:
+ """Yield ``(organism, gene)`` pairs from a KO entry's GENES block.
+
+ Lines look like ``BSU: BSU31050(gbsB) BSU31060`` — an upper-case organism
+ code, a colon, then space-separated gene ids (with an optional ``(name)``
+ suffix that we strip). Organism codes are lower-cased to match KEGG's protein
+ sequence files (as RAVEN does).
+ """
+ for line in lines:
+ org, sep, rest = line.partition(":")
+ if not sep:
+ continue
+ organism = org.strip().lower()
+ for token in rest.split():
+ gene = token.split("(", 1)[0]
+ if gene:
+ yield organism, gene
+
+
+# --------------------------------------------------------------------------- #
+# Reference model + tables
+# --------------------------------------------------------------------------- #
+_COMPARTMENT = "s" # single 'system' compartment, as in getModelFromKEGG
+
+
+def build_reference_model(
+ reactions: list[KeggReaction], compounds: list[KeggCompound]
+) -> cobra.Model:
+ """Assemble the gene-free KEGG reference model from parsed records.
+
+ Only metabolites actually used by a reaction are added. Reactions carry KEGG
+ annotations (reaction id, KO ids, EC codes, pathways) but **no genes/GPRs**.
+ Bounds are ``(-1000, 1000)`` for reversible reactions and ``(0, 1000)``
+ otherwise.
+ """
+ model = cobra.Model("KEGG")
+ model.name = "Automatically generated from KEGG database"
+
+ by_id = {c.id: c for c in compounds}
+ # Reuse the cached parse from parse_kegg_reactions; only re-parse for
+ # callers that constructed KeggReaction records without the cache.
+ parsed = {
+ r.id: (r.stoichiometry if r.stoichiometry else _parse_equation(r.equation)[0])
+ for r in reactions
+ }
+ used = {m for stoich in parsed.values() for m in stoich}
+
+ metabolites = []
+ for cid in sorted(used):
+ cmp = by_id.get(cid)
+ met = cobra.Metabolite(cid, compartment=_COMPARTMENT)
+ if cmp:
+ met.name = cmp.name or cid
+ met.formula = cmp.formula or None
+ if cmp.chebi:
+ met.annotation["chebi"] = cmp.chebi
+ if cmp.pubchem:
+ met.annotation["pubchem.substance"] = cmp.pubchem
+ if cmp.inchi:
+ met.annotation["inchi"] = cmp.inchi
+ else:
+ met.name = cid
+ metabolites.append(met)
+ model.add_metabolites(metabolites)
+ met_index = {m.id: m for m in metabolites}
+
+ cobra_reactions = []
+ for rxn in reactions:
+ stoich = parsed[rxn.id]
+ if not stoich: # empty (e.g. A <=> A) -> skip, as RAVEN drops bad rxns
+ continue
+ reaction = cobra.Reaction(rxn.id, name=rxn.name)
+ reaction.bounds = (-1000.0, 1000.0) if rxn.reversible else (0.0, 1000.0)
+ reaction.add_metabolites({met_index[m]: c for m, c in stoich.items()})
+ reaction.annotation["kegg.reaction"] = rxn.id
+ if rxn.kos:
+ reaction.annotation["kegg.orthology"] = rxn.kos
+ if rxn.eccodes:
+ reaction.annotation["ec-code"] = rxn.eccodes
+ if rxn.pathways:
+ reaction.annotation["kegg.pathway"] = rxn.pathways
+ cobra_reactions.append(reaction)
+ model.add_reactions(cobra_reactions)
+ return model
+
+
+def build_kegg_tables(
+ reactions: list[KeggReaction], kos: list[KeggKO]
+) -> dict[str, pd.DataFrame]:
+ """Build the minimal relational tables from parsed records.
+
+ Returns a dict of ``DataFrame``s keyed by table name: ``ko_reaction``,
+ ``ko_names``, ``organism_gene_ko``, ``rxn_flags``.
+ """
+ ko_reaction = pd.DataFrame(
+ [(ko, r.id) for r in reactions for ko in r.kos],
+ columns=["ko", "reaction"],
+ ).drop_duplicates(ignore_index=True)
+
+ ko_names = pd.DataFrame(
+ [(ko.id, ko.name) for ko in kos], columns=["ko", "name"]
+ )
+
+ organism_gene_ko = pd.DataFrame(
+ [(org, gene, ko.id) for ko in kos for org, gene in ko.genes],
+ columns=["organism", "gene", "ko"],
+ ).drop_duplicates(ignore_index=True)
+
+ rxn_flags = pd.DataFrame(
+ [
+ (r.id, r.spontaneous, r.undefined_stoich, r.incomplete, r.general)
+ for r in reactions
+ ],
+ columns=["reaction", "spontaneous", "undefined_stoich", "incomplete", "general"],
+ )
+
+ return {
+ "ko_reaction": ko_reaction,
+ "ko_names": ko_names,
+ "organism_gene_ko": organism_gene_ko,
+ "rxn_flags": rxn_flags,
+ }
+
+
+def write_kegg_tables(tables: dict[str, pd.DataFrame], out_dir: str | Path) -> list[Path]:
+ """Write each table as a gzipped TSV (``.tsv.gz``) into ``out_dir``.
+
+ Gzipped TSV is the dependency-free cross-language format shared with MATLAB
+ RAVEN (see docs/kegg_data_format.md). Returns the written paths.
+ """
+ out_dir = Path(out_dir)
+ out_dir.mkdir(parents=True, exist_ok=True)
+ written = []
+ for name, frame in tables.items():
+ path = out_dir / f"{name}.tsv.gz"
+ with gzip.open(path, "wt", encoding="utf-8", newline="") as handle:
+ frame.to_csv(handle, sep="\t", index=False)
+ written.append(path)
+ return written
+
+
+def read_kegg_table(path: str | Path) -> pd.DataFrame:
+ """Read a KEGG table written by :func:`write_kegg_tables` or
+ :func:`stream_organism_gene_ko`.
+
+ Compression is inferred from the suffix, so both the gzipped small tables
+ (``.tsv.gz``) and the xz-compressed ``organism_gene_ko.tsv.xz`` are read
+ transparently.
+ """
+ return pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False)
+
+
+def _flush_sorted_run(rows: list[str], tmp_dir: Path, run_no: int) -> Path:
+ """Sort a buffer of ``organism\\tgene\\tko\\n`` lines and write one gzipped run."""
+ rows.sort(key=_ogk_sort_key)
+ run_path = tmp_dir / f"run_{run_no:04d}.gz"
+ with gzip.open(run_path, "wt", encoding="utf-8", newline="") as run:
+ run.writelines(rows)
+ return run_path
+
+
+def _ogk_sort_key(line: str) -> tuple[str, str]:
+ """Sort key ``(organism, gene)`` for an ``organism\\tgene\\tko`` line."""
+ organism, gene, _ = line.split("\t", 2)
+ return organism, gene
+
+
+def stream_organism_gene_ko(
+ kegg_dir: str | Path, keep: set[str], ogk_path: str | Path, *, chunk_rows: int = 1_000_000
+) -> pd.DataFrame:
+ """Stream the ``ko`` file to a sorted, xz-compressed ``organism_gene_ko.tsv.xz``.
+
+ Real KEGG has ~9M gene↔KO associations — far too many to hold in memory as a
+ DataFrame. Rows are sorted by ``(organism, gene)`` before writing: gene IDs
+ from one organism share long common prefixes (locus tags, numeric runs), so
+ sorting makes them adjacent and lets the compressor shrink the table ~2.9x
+ versus the unsorted gzip form. The order also matches the by-organism query
+ pattern in :func:`get_kegg_model_for_organism`.
+
+ The sort is an **external merge sort** bounded to ``chunk_rows`` rows in
+ memory at a time (sorted runs spooled to gzipped temp files, then merged with
+ :func:`heapq.merge`), so peak memory stays flat regardless of KEGG size. Only
+ the small ``ko_names`` table (one row per KO) is held in full and returned.
+ """
+ ogk_path = Path(ogk_path)
+ names: list[tuple[str, str]] = []
+ buffer: list[str] = []
+ runs: list[Path] = []
+
+ with tempfile.TemporaryDirectory(prefix="ogk_sort_", dir=ogk_path.parent) as tmp:
+ tmp_dir = Path(tmp)
+ for entry in _iter_entries(Path(kegg_dir) / "ko"):
+ ko_id = _first_id(entry.get("ENTRY", []))
+ if not ko_id or ko_id not in keep:
+ continue
+ names.append((ko_id, entry["DEFINITION"][0].strip() if entry.get("DEFINITION") else ""))
+ for organism, gene in _parse_gene_lines(entry.get("GENES", [])):
+ buffer.append(f"{organism}\t{gene}\t{ko_id}\n")
+ if len(buffer) >= chunk_rows:
+ runs.append(_flush_sorted_run(buffer, tmp_dir, len(runs)))
+ buffer = []
+ if buffer:
+ runs.append(_flush_sorted_run(buffer, tmp_dir, len(runs)))
+
+ handles = [gzip.open(r, "rt", encoding="utf-8") for r in runs]
+ try:
+ with lzma.open(ogk_path, "wt", encoding="utf-8", newline="") as out:
+ out.write("organism\tgene\tko\n")
+ out.writelines(heapq.merge(*handles, key=_ogk_sort_key))
+ finally:
+ for h in handles:
+ h.close()
+ return pd.DataFrame(names, columns=["ko", "name"])
+
+
+def parse_kegg_dump(kegg_dir: str | Path, out_dir: str | Path) -> dict[str, Path]:
+ """Parse a full KEGG dump into the reference model + tables and write them out.
+
+ Writes ``reference_model.yml.gz`` (gzipped RAVEN/cobra YAML) plus the
+ gzipped-TSV tables into ``out_dir`` and returns ``{name: path}`` for
+ everything written. The large
+ ``organism_gene_ko`` table is streamed to disk (see
+ :func:`stream_organism_gene_ko`) rather than built in memory, so this scales
+ to the full KEGG database; the small derived tables are built in memory.
+ """
+ out_dir = Path(out_dir)
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ reactions = parse_kegg_reactions(kegg_dir)
+ compounds = parse_kegg_compounds(kegg_dir)
+ linked_kos = {ko for r in reactions for ko in r.kos}
+
+ model = build_reference_model(reactions, compounds)
+
+ small = {
+ "ko_reaction": pd.DataFrame(
+ [(ko, r.id) for r in reactions for ko in r.kos], columns=["ko", "reaction"]
+ ).drop_duplicates(ignore_index=True),
+ "rxn_flags": pd.DataFrame(
+ [(r.id, r.spontaneous, r.undefined_stoich, r.incomplete, r.general) for r in reactions],
+ columns=["reaction", "spontaneous", "undefined_stoich", "incomplete", "general"],
+ ),
+ }
+ paths = {name: p for name, p in zip(small, write_kegg_tables(small, out_dir), strict=True)}
+
+ ogk_path = out_dir / "organism_gene_ko.tsv.xz"
+ ko_names = stream_organism_gene_ko(kegg_dir, linked_kos, ogk_path)
+ paths["organism_gene_ko"] = ogk_path
+ paths.update(
+ zip(["ko_names"], write_kegg_tables({"ko_names": ko_names}, out_dir), strict=True)
+ )
+
+ ref_path = out_dir / "reference_model.yml.gz"
+ write_yaml_model(model, ref_path)
+ paths["reference_model"] = ref_path
+ return paths
diff --git a/src/raven_python/reconstruction/kegg/query.py b/src/raven_python/reconstruction/kegg/query.py
new file mode 100644
index 0000000..2df3f78
--- /dev/null
+++ b/src/raven_python/reconstruction/kegg/query.py
@@ -0,0 +1,231 @@
+"""De-novo KEGG draft from a proteome FASTA via HMM search (step 3b.5).
+
+Ports the FASTA/HMM branch of RAVEN ``getKEGGModelForOrganism``: search a query
+proteome against the KO profile-HMM library (3b.3), assign genes to KOs using the
+score cut-off and the two score-ratio filters, then build the draft model with the
+shared assembler. For organisms not in KEGG.
+
+Improvement over RAVEN: one ``hmmscan`` against the single ``hmmpress``-ed library
+(K7) replaces RAVEN's per-KO ``hmmsearch`` loop. Phylogenetic-distance subsampling
+is **not** used — our prebuilt prok90/euk90 libraries already fix the sequence set,
+so picking the right domain library (not per-organism distance weighting) is the
+relevant choice.
+
+The scoring/assignment logic (:func:`assign_kos`, :func:`parse_hmmscan_tblout`) is
+pure and unit-tested; running the search needs HMMER (``hmmscan``).
+"""
+from __future__ import annotations
+
+import math
+import subprocess
+import tempfile
+from pathlib import Path
+
+import cobra
+import pandas as pd
+
+from raven_python.binaries import resolve_binary
+from raven_python.io.yaml import read_yaml_model
+from raven_python.reconstruction.kegg.assemble import assemble_model_from_ko_genes
+from raven_python.reconstruction.kegg.parse import read_kegg_table
+
+_NOTE = "Included by get_kegg_model_from_sequences (using HMMs)"
+_MIN_EVALUE = 1e-250 # floor for a reported E-value of 0, to keep logs finite
+
+
+def run_hmmscan(
+ fasta: str | Path,
+ library: str | Path,
+ *,
+ threads: int = 1,
+ hmmscan: str | Path | None = None,
+) -> str:
+ """Run ``hmmscan`` of ``fasta`` against the pressed ``library``; return tblout text."""
+ exe = resolve_binary("hmmscan", binary=hmmscan)
+ with tempfile.TemporaryDirectory() as tmp:
+ tbl = Path(tmp) / "hits.tbl"
+ cmd = [exe, "--cpu", str(threads), "--tblout", str(tbl), str(library), str(fasta)]
+ proc = subprocess.run(cmd, capture_output=True, text=True)
+ if proc.returncode != 0:
+ raise RuntimeError(f"hmmscan failed:\n{(proc.stderr or '').strip()}")
+ return tbl.read_text()
+
+
+def parse_hmmscan_tblout(text: str) -> pd.DataFrame:
+ """Parse ``hmmscan --tblout`` text into a ``[ko, gene, evalue]`` table.
+
+ In ``hmmscan`` the HMM database is the *target*, so column 1 (target name) is
+ the KO, column 3 (query name) is the proteome gene, and column 5 is the
+ full-sequence E-value.
+ """
+ rows = []
+ for line in text.splitlines():
+ if not line or line.startswith("#"):
+ continue
+ fields = line.split()
+ if len(fields) < 5:
+ continue
+ rows.append((fields[0], fields[2], float(fields[4])))
+ return pd.DataFrame(rows, columns=["ko", "gene", "evalue"])
+
+
+def assign_kos(
+ hits: pd.DataFrame,
+ *,
+ cutoff: float = 1e-30,
+ min_score_ratio_ko: float = 0.3,
+ min_score_ratio_g: float = 0.9,
+) -> dict[str, list[str]]:
+ """Assign genes to KOs from HMM hits, applying the cut-off and ratio filters.
+
+ Ports RAVEN's three steps on the KO×gene E-value matrix:
+
+ 1. keep hits with ``evalue <= cutoff``;
+ 2. **min_score_ratio_ko** — within a KO, drop genes whose
+ ``log(evalue)/log(best_evalue_in_KO) < min_score_ratio_ko`` (prune weak
+ members of a KO);
+ 3. **min_score_ratio_g** — within a gene, drop KOs whose
+ ``log(evalue)/log(best_evalue_for_gene) < min_score_ratio_g`` (stop a gene
+ that clearly belongs to one KO leaking into weaker ones).
+
+ Smaller E-value = better; since all kept values are ``< 1`` their logs are
+ negative, so the best (smallest) hit gives ratio 1 and weaker hits give a
+ smaller positive ratio.
+
+ Default calibration (see IMPROVEMENTS K15). Cross-validated against the true
+ KEGG gene→KO annotation of four organisms spanning the prok/euk libraries and
+ the well-/lesser-studied axis (*S. cerevisiae*, *Cyanidioschyzon merolae*,
+ *E. coli*, *Mycoplasma genitalium*): real annotations score
+ overwhelmingly (median E ≈ 1e-100…1e-155) while spurious hits pile up at
+ ≈1e-8, so the two are separated by ~20 orders of magnitude. RAVEN's
+ ``1e-50`` sits inside the *true* tail and silently drops real but divergent
+ hits — costing 16% gene→KO recall on the divergent minimal genome
+ (*M. genitalium*) for no noise-rejection benefit (noise is far weaker). The
+ default is therefore loosened to **1e-30** (recovers that tail; still ~22
+ orders above the noise floor), with the precision work moved to
+ **min_score_ratio_g = 0.9** — the *effective* precision lever (it resolves
+ multi-KO genes). ``min_score_ratio_ko`` proved empirically inert across all
+ four organisms (identical output at 0.0/0.3/0.5) and is kept only for RAVEN
+ parity.
+ """
+ # The ratio filters compare log(evalue)/log(best_evalue); when best == 1.0
+ # the denominator is 0 → ZeroDivisionError. The default cutoff (1e-30) keeps
+ # us safely away, but a caller-passed cutoff ≥ 1 is ambiguous and would
+ # crash later. Reject it up front with a clear message.
+ if cutoff >= 1:
+ raise ValueError(
+ f"cutoff must be < 1 (smaller E-value = better hit); got {cutoff!r}."
+ )
+
+ # Best (smallest) E-value per (ko, gene), filtered at the cut-off.
+ mat: dict[str, dict[str, float]] = {}
+ for ko, gene, evalue in zip(hits["ko"], hits["gene"], hits["evalue"], strict=True):
+ if evalue > cutoff:
+ continue
+ e = evalue if evalue > 0 else _MIN_EVALUE
+ per_ko = mat.setdefault(ko, {})
+ if gene not in per_ko or e < per_ko[gene]:
+ per_ko[gene] = e
+
+ # Step 2: prune weak genes within each KO.
+ for ko, genes in mat.items():
+ log_best = math.log(min(genes.values()))
+ mat[ko] = {
+ g: e for g, e in genes.items() if math.log(e) / log_best >= min_score_ratio_ko
+ }
+
+ # Step 3: prune weak KOs within each gene (over the survivors of step 2).
+ gene_kos: dict[str, dict[str, float]] = {}
+ for ko, genes in mat.items():
+ for g, e in genes.items():
+ gene_kos.setdefault(g, {})[ko] = e
+ dropped: set[tuple[str, str]] = set()
+ for g, kos in gene_kos.items():
+ log_best = math.log(min(kos.values()))
+ for ko, e in kos.items():
+ if math.log(e) / log_best < min_score_ratio_g:
+ dropped.add((ko, g))
+
+ result: dict[str, list[str]] = {}
+ for ko, genes in mat.items():
+ kept = sorted(g for g in genes if (ko, g) not in dropped)
+ if kept:
+ result[ko] = kept
+ return result
+
+
+def get_kegg_model_from_sequences(
+ fasta: str | Path,
+ reference_model: cobra.Model,
+ ko_reaction: pd.DataFrame,
+ library: str | Path,
+ *,
+ rxn_flags: pd.DataFrame | None = None,
+ model_id: str | None = None,
+ cutoff: float = 1e-30,
+ min_score_ratio_ko: float = 0.3,
+ min_score_ratio_g: float = 0.9,
+ keep_spontaneous: bool = True,
+ keep_undefined_stoich: bool = True,
+ keep_incomplete: bool = True,
+ keep_general: bool = False,
+ threads: int = 1,
+ hmmscan: str | Path | None = None,
+) -> cobra.Model:
+ """Reconstruct a draft model for a proteome by HMM-searching the KO library.
+
+ Searches ``fasta`` against the pressed ``library`` (3b.3), assigns KOs
+ (:func:`assign_kos`), and assembles the model against ``reference_model`` /
+ ``ko_reaction``. Genes are the query proteome's identifiers.
+ """
+ hits = parse_hmmscan_tblout(run_hmmscan(fasta, library, threads=threads, hmmscan=hmmscan))
+ ko_to_genes = assign_kos(
+ hits,
+ cutoff=cutoff,
+ min_score_ratio_ko=min_score_ratio_ko,
+ min_score_ratio_g=min_score_ratio_g,
+ )
+ model, _ = assemble_model_from_ko_genes(
+ reference_model,
+ ko_reaction,
+ ko_to_genes,
+ rxn_flags=rxn_flags,
+ keep_spontaneous=keep_spontaneous,
+ keep_undefined_stoich=keep_undefined_stoich,
+ keep_incomplete=keep_incomplete,
+ keep_general=keep_general,
+ model_id=model_id,
+ note=_NOTE,
+ )
+ return model
+
+
+def get_kegg_model_from_sequences_with_artefacts(
+ fasta: str | Path,
+ artefact_dir: str | Path | None = None,
+ library: str | Path | None = None,
+ *,
+ domain: str = "prokaryotes",
+ version: str | None = None,
+ **kwargs,
+) -> cobra.Model:
+ """Load reference model + tables from ``artefact_dir`` and run the HMM query.
+
+ If ``artefact_dir`` / ``library`` are ``None`` they are fetched/cached via
+ :func:`raven_python.data.ensure_kegg_data` / :func:`raven_python.data.ensure_kegg_hmm_library`
+ (``domain`` selects the prok/euk library; ``version`` the release).
+ """
+ if artefact_dir is None or library is None:
+ from raven_python.data import ensure_kegg_data, ensure_kegg_hmm_library
+
+ if artefact_dir is None:
+ artefact_dir = ensure_kegg_data(version=version)
+ if library is None:
+ library = ensure_kegg_hmm_library(domain, version=version)
+ artefact_dir = Path(artefact_dir)
+ reference_model = read_yaml_model(artefact_dir / "reference_model.yml.gz")
+ ko_reaction = read_kegg_table(artefact_dir / "ko_reaction.tsv.gz")
+ rxn_flags = read_kegg_table(artefact_dir / "rxn_flags.tsv.gz")
+ return get_kegg_model_from_sequences(
+ fasta, reference_model, ko_reaction, library, rxn_flags=rxn_flags, **kwargs
+ )
diff --git a/src/raven_python/reconstruction/kegg/taxonomy.py b/src/raven_python/reconstruction/kegg/taxonomy.py
new file mode 100644
index 0000000..463fcce
--- /dev/null
+++ b/src/raven_python/reconstruction/kegg/taxonomy.py
@@ -0,0 +1,71 @@
+"""Parse the KEGG ``taxonomy`` file into per-organism category lineages.
+
+Ports the file-reading half of RAVEN ``getPhylDist`` (the distance-matrix half is
+step 3b.5). The ``taxonomy`` file is an indented tree: ``#``-prefixed lines name a
+category, the number of leading ``#`` giving its depth; organism lines are
+tab-separated ``T-numberorg_idname...``. Each organism inherits the
+stack of categories above it, the first of which is its domain (``Prokaryotes`` /
+``Eukaryotes``).
+
+Used by 3b.3 to split genes into the prok/euk HMM libraries, and (later) by 3b.5
+for phylogenetic distances.
+"""
+from __future__ import annotations
+
+import warnings
+from pathlib import Path
+
+
+def parse_taxonomy(path: str | Path) -> dict[str, list[str]]:
+ """Return ``{organism_id: [category, ...]}`` from outermost to innermost."""
+ org_categories: dict[str, list[str]] = {}
+ stack: list[str] = []
+ skipped_level_warned = False
+ with open(path, encoding="utf-8") as handle:
+ for line_no, raw in enumerate(handle, start=1):
+ line = raw.rstrip("\n")
+ if not line.strip():
+ continue
+ if line.startswith("#"):
+ depth = len(line) - len(line.lstrip("#"))
+ name = line[depth:].strip()
+ if depth - 1 > len(stack):
+ # Depth-skip (e.g. ## then ####): the original `stack[:depth-1]`
+ # silently produced a too-short lineage. Pad with explicit
+ # blanks so downstream slices stay aligned; warn once.
+ if not skipped_level_warned:
+ warnings.warn(
+ f"{path}: taxonomy depth skips a level near line {line_no} "
+ f"({'#' * depth} {name!r} appeared with stack {stack!r}); "
+ "padding the missing levels with '' (later occurrences silenced).",
+ stacklevel=2,
+ )
+ skipped_level_warned = True
+ stack = stack + [""] * (depth - 1 - len(stack))
+ else:
+ stack = stack[: depth - 1]
+ stack.append(name)
+ else:
+ fields = line.split("\t") if "\t" in line else line.split()
+ if len(fields) < 2:
+ continue
+ org_categories[fields[1].strip()] = list(stack)
+ return org_categories
+
+
+def organism_domains(path: str | Path) -> dict[str, str]:
+ """Return ``{organism_id: domain}`` (the outermost category)."""
+ return {org: cats[0] for org, cats in parse_taxonomy(path).items() if cats}
+
+
+def organisms_in_domain(path: str | Path, domain: str) -> set[str]:
+ """Organism ids whose outermost category matches ``domain`` (case-insensitive).
+
+ Accepts a prefix, so ``"prok"`` matches ``"Prokaryotes"``.
+ """
+ needle = domain.lower()
+ return {
+ org
+ for org, dom in organism_domains(path).items()
+ if dom.lower().startswith(needle) or needle.startswith(dom.lower())
+ }
diff --git a/src/raven_python/tasks/__init__.py b/src/raven_python/tasks/__init__.py
new file mode 100644
index 0000000..d232c16
--- /dev/null
+++ b/src/raven_python/tasks/__init__.py
@@ -0,0 +1,23 @@
+"""Metabolic task definition, parsing, and checking.
+
+* :class:`Task` + :func:`parse_task_list` — the task-list file format.
+* :func:`check_tasks` + :class:`TaskResult` — run tasks against a model.
+* :func:`find_task_essential_reactions` + :class:`EssentialReactionsResult` — reactions
+ a model must use to satisfy a task list (the input for (f)tINIT's task layer).
+"""
+from raven_python.tasks.check import (
+ EssentialReactionsResult,
+ TaskResult,
+ check_tasks,
+ find_task_essential_reactions,
+)
+from raven_python.tasks.tasklist import Task, parse_task_list
+
+__all__ = [
+ "EssentialReactionsResult",
+ "Task",
+ "TaskResult",
+ "check_tasks",
+ "find_task_essential_reactions",
+ "parse_task_list",
+]
diff --git a/src/raven_python/tasks/check.py b/src/raven_python/tasks/check.py
new file mode 100644
index 0000000..817bae5
--- /dev/null
+++ b/src/raven_python/tasks/check.py
@@ -0,0 +1,332 @@
+"""Check whether a model performs a set of metabolic tasks.
+
+For each task the model is constrained by the task's allowed inputs/outputs (and any
+extra reactions / bound changes), then tested for feasibility: a task *passes* if a
+steady-state flux exists, unless it is marked ``should_fail`` (then it passes iff
+infeasible).
+
+Inputs/outputs are encoded as ranges on the per-metabolite mass-balance constraint
+(``model.constraints[met.id]``): an input allows net consumption (``Sv ∈ [-UB, -LB]``)
+and an output allows / requires net production (``Sv ≤ UB``, and ``≥ LB`` if
+``LB > 0``). Existing boundary reactions are closed first, so inputs/outputs are
+defined solely by the task (closed-model semantics).
+"""
+from __future__ import annotations
+
+import pickle
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+import cobra
+from cobra.exceptions import OptimizationError
+from cobra.flux_analysis import flux_variability_analysis, pfba
+from optlang.symbolics import Zero
+
+from raven_python.manipulation.add import add_reactions_from_equations
+from raven_python.tasks.tasklist import Task, parse_task_list
+
+_ALLMETS = "ALLMETS"
+_ALLMETSIN = "ALLMETSIN"
+
+
+@dataclass
+class TaskResult:
+ """Result of one task: ``passed`` is the verdict (accounts for ``should_fail``)."""
+
+ id: str
+ description: str
+ passed: bool
+ feasible: bool
+ error: str | None = None
+
+
+def _set_constraint_bounds(constraint, lb: float, ub: float) -> None:
+ """Set an optlang constraint's bounds without a transient lb > ub."""
+ if lb > constraint.ub:
+ constraint.ub = ub
+ constraint.lb = lb
+ else:
+ constraint.lb = lb
+ constraint.ub = ub
+
+
+def _classify(token: str) -> tuple[str, str | None]:
+ """Return ``("all", None)``, ``("comp", COMP)``, or ``("met", token_upper)``."""
+ upper = token.upper()
+ if upper == _ALLMETS:
+ return "all", None
+ if upper.startswith(_ALLMETSIN + "[") and upper.endswith("]"):
+ return "comp", upper[len(_ALLMETSIN) + 1: -1]
+ return "met", upper # incl. malformed ALLMETSIN[... → treated as a (missing) metabolite
+
+
+def _metabolite_bounds(
+ task: Task, name_to_ids: dict[str, list[str]], comp_to_ids: dict[str, list[str]]
+) -> tuple[dict[str, list[float]], list[str]]:
+ """Compute ``{met_id: [lb, ub]}`` from a task's inputs/outputs (RAVEN ``b``).
+
+ Bulk tokens (ALLMETS / ALLMETSIN) are applied before specific metabolites, as
+ RAVEN does. Returns the bounds and a list of unresolved tokens (→ task error).
+ """
+ bounds: dict[str, list[float]] = {}
+ missing: list[str] = []
+
+ def touch(mid: str) -> list[float]:
+ return bounds.setdefault(mid, [0.0, 0.0])
+
+ for entries, is_input in ((task.inputs, True), (task.outputs, False)):
+ bulk = [(t, lb, ub) for (t, lb, ub) in entries if _classify(t)[0] != "met"]
+ specific = [(t, lb, ub) for (t, lb, ub) in entries if _classify(t)[0] == "met"]
+ for token, lb, ub in bulk + specific:
+ kind, arg = _classify(token)
+ if kind == "all":
+ ids = [mid for group in comp_to_ids.values() for mid in group]
+ elif kind == "comp":
+ ids = comp_to_ids.get(arg, [])
+ else:
+ ids = name_to_ids.get(arg, [])
+ if not ids:
+ missing.append(token)
+ continue
+ for mid in ids:
+ b = touch(mid)
+ if is_input:
+ b[0] = -ub # allow net consumption up to UB (RAVEN b1 = -UBin)
+ if kind == "met":
+ b[1] = -lb
+ else:
+ b[1] = ub # allow net production up to UB
+ if kind == "met" and lb > 0:
+ b[0] = lb # require at least LB produced
+ return bounds, missing
+
+
+def task_name_maps(model: cobra.Model) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
+ """Build ``name[comp]→[ids]`` and ``comp→[ids]`` lookups for a model's metabolites.
+
+ ``name[comp]`` maps to a *list* because a model can carry several metabolites with
+ the same name and compartment; a task referencing it constrains all of them (as
+ RAVEN does), rather than an arbitrary one.
+ """
+ name_to_ids: dict[str, list[str]] = {}
+ comp_to_ids: dict[str, list[str]] = {}
+ for m in model.metabolites:
+ name_to_ids.setdefault(f"{m.name}[{m.compartment}]".upper(), []).append(m.id)
+ comp_to_ids.setdefault((m.compartment or "").upper(), []).append(m.id)
+ return name_to_ids, comp_to_ids
+
+
+def apply_task_constraints(
+ model: cobra.Model, task: Task, name_to_id, comp_to_ids
+) -> tuple[set[str], str | None]:
+ """Apply a task's inputs/outputs/equations/bound-changes to ``model`` in place.
+
+ Sets a feasibility (zero) objective. Returns ``(task_metabolite_ids, error)``;
+ ``task_metabolite_ids`` are the model metabolites the task references (RAVEN's
+ ``essentialMetsForTasks``). On error the model may be partially modified.
+ """
+ bounds, missing = _metabolite_bounds(task, name_to_id, comp_to_ids)
+ if missing:
+ return set(), f"unknown metabolite(s): {sorted(set(missing))}"
+ task_mets = {mid for mid in bounds}
+ for mid, (lb, ub) in bounds.items():
+ if (lb, ub) != (0.0, 0.0):
+ _set_constraint_bounds(model.constraints[mid], lb, ub)
+
+ if task.equations:
+ existing = {m.id for m in model.metabolites}
+ specs = [
+ {"id": f"TASK_TMP_{i}", "equation": equ, "bounds": (lb, ub)}
+ for i, (equ, lb, ub) in enumerate(task.equations)
+ ]
+ add_reactions_from_equations(model, specs, mets_by="name", allow_new_mets=True)
+ for i in range(len(specs)):
+ tmp = model.reactions.get_by_id(f"TASK_TMP_{i}")
+ task_mets |= {m.id for m in tmp.metabolites if m.id in existing}
+
+ for rxn_id, lb, ub in task.changed:
+ if rxn_id not in model.reactions:
+ return set(), f"CHANGED RXN not in model: {rxn_id!r}"
+ model.reactions.get_by_id(rxn_id).bounds = (lb, ub)
+
+ model.objective = model.problem.Objective(Zero, direction="max") # feasibility only
+ return task_mets, None
+
+
+def _build_task_model(
+ base: cobra.Model, task: Task, name_to_id, comp_to_ids
+) -> tuple[cobra.Model | None, set[str], str | None]:
+ """Copy ``base`` and apply a task's constraints (``model``/``error`` exclusive)."""
+ model = base.copy()
+ task_mets, error = apply_task_constraints(model, task, name_to_id, comp_to_ids)
+ return (None if error else model), task_mets, error
+
+
+def _run_task(base: cobra.Model, task: Task, name_to_id, comp_to_ids) -> TaskResult:
+ """Test one task by applying its constraints to ``base`` in place, then reverting.
+
+ Avoids copying the (genome-scale) model per task — the copy dominates ``check_tasks``
+ runtime. ``with base:`` reverts everything ``apply_task_constraints`` does through
+ cobra's API (temp reactions/metabolites for equations, reaction bounds, objective);
+ the one untracked change — direct metabolite mass-balance (``model.constraints[mid]``)
+ bound edits — is snapshotted and restored explicitly. Net result is identical to the
+ copy-based version but reuses a single model across all tasks.
+ """
+ bounds, missing = _metabolite_bounds(task, name_to_id, comp_to_ids)
+ if missing:
+ return TaskResult(task.id, task.description, False, False,
+ f"unknown metabolite(s): {sorted(set(missing))}")
+ saved = {mid: (base.constraints[mid].lb, base.constraints[mid].ub) for mid in bounds}
+ try:
+ with base: # reverts temp reactions/mets, reaction bounds, objective on exit
+ _, error = apply_task_constraints(base, task, name_to_id, comp_to_ids)
+ if error is not None:
+ return TaskResult(task.id, task.description, False, False, error)
+ base.slim_optimize()
+ feasible = base.solver.status == "optimal"
+ finally: # restore the untracked metabolite-constraint bound edits
+ for mid, (lb, ub) in saved.items():
+ _set_constraint_bounds(base.constraints[mid], lb, ub)
+ return TaskResult(task.id, task.description, feasible != task.should_fail, feasible)
+
+
+def check_tasks(
+ model: cobra.Model,
+ tasks: str | Iterable[Task],
+ *,
+ close_boundaries: bool = True,
+) -> list[TaskResult]:
+ """Run a task list against ``model`` and return a :class:`TaskResult` per task.
+
+ ``tasks`` is a parsed list of :class:`Task` or a path to a task-list file. With
+ ``close_boundaries`` (default), existing exchange/sink/demand reactions are
+ closed so inputs/outputs are defined purely by the tasks (as RAVEN assumes).
+ """
+ tasks = _as_tasks(tasks)
+ base, name_to_id, comp_to_ids = _prepare_base(model, close_boundaries)
+ return [_run_task(base, task, name_to_id, comp_to_ids) for task in tasks]
+
+
+def _as_tasks(tasks: str | Iterable[Task]) -> list[Task]:
+ if isinstance(tasks, (str, bytes)) or hasattr(tasks, "__fspath__"):
+ return parse_task_list(tasks)
+ return list(tasks)
+
+
+def _prepare_base(model: cobra.Model, close_boundaries: bool):
+ base = model.copy()
+ if close_boundaries:
+ for rxn in base.boundary:
+ rxn.bounds = (0.0, 0.0)
+ name_to_id, comp_to_ids = task_name_maps(base)
+ return base, name_to_id, comp_to_ids
+
+
+@dataclass
+class EssentialReactionsResult:
+ """Reactions a model *must* use to perform a task list (RAVEN ``essentialRxns``).
+
+ ``reactions`` maps reaction id → forced flux direction (``+1`` forward, ``-1``
+ reverse): the reaction must carry flux of that sign in every feasible solution of
+ at least one task. ``per_task`` is the same, split by task id. ``task_metabolites``
+ are the model metabolites the tasks reference (RAVEN ``essentialMetsForTasks``,
+ protected from removal). ``failed_tasks`` are tasks that were infeasible or
+ malformed and thus skipped (RAVEN drops these from the task list).
+ """
+
+ reactions: dict[str, int]
+ per_task: dict[str, dict[str, int]]
+ task_metabolites: set[str]
+ failed_tasks: list[str]
+
+
+def _task_essential_reactions(
+ task_model: cobra.Model, candidates: list[str], tol: float
+) -> dict[str, int]:
+ """Reactions in ``candidates`` forced to carry flux, with direction, via FVA.
+
+ A reaction is *essential* for the task iff zero is not attainable in any feasible
+ solution — i.e. its FVA range excludes 0. This is exactly RAVEN's
+ "constrain to 0 → infeasible" definition, but obtained from FVA ranges (no
+ per-reaction knockout loop). The nonzero side of the range gives the forced
+ direction. FVA is restricted to ``candidates`` — the reactions carrying flux in a
+ minimal feasible solution, the only ones that *can* be essential (an essential
+ reaction is nonzero in every feasible solution, so also in that one) — which keeps
+ this cheap on genome-scale templates instead of ranging all reactions.
+ """
+ if not candidates:
+ return {}
+ fva = flux_variability_analysis(task_model, reaction_list=candidates, fraction_of_optimum=0.0)
+ essential: dict[str, int] = {}
+ for rxn_id, lo, hi in zip(fva.index, fva["minimum"], fva["maximum"], strict=True):
+ if lo > tol:
+ essential[rxn_id] = 1
+ elif hi < -tol:
+ essential[rxn_id] = -1
+ return essential
+
+
+def find_task_essential_reactions(
+ model: cobra.Model,
+ tasks: str | Iterable[Task],
+ *,
+ close_boundaries: bool = True,
+ tol: float = 1e-8,
+ cache_path: str | Path | None = None,
+) -> EssentialReactionsResult:
+ """Find the reactions a model must use to satisfy a task list.
+
+ For each task the model is constrained as in :func:`check_tasks`, then FVA
+ identifies reactions whose flux can never be zero (essential) and their forced
+ direction. This is the ``prepINITModel`` step that feeds (ft)INIT: essential
+ reactions are kept regardless of expression score and made irreversible in their
+ forced direction. When a reaction is essential in several tasks with conflicting
+ directions, the majority wins (ties → forward), matching RAVEN's ``pos < neg``.
+
+ On a genome-scale model this is slow (an FVA per task). Pass ``cache_path`` to make
+ it **resumable**: each task's result is written there as it completes (atomically),
+ and a re-run skips tasks already cached — so it survives interruptions and finishes
+ across several sessions.
+ """
+ tasks = _as_tasks(tasks)
+ base, name_to_id, comp_to_ids = _prepare_base(model, close_boundaries)
+ original_ids = {r.id for r in base.reactions}
+
+ per_task: dict[str, dict[str, int]] = {}
+ task_metabolites: set[str] = set()
+ failed: list[str] = []
+ if cache_path is not None and Path(cache_path).exists():
+ cached = pickle.load(open(cache_path, "rb"))
+ per_task, task_metabolites, failed = cached["per_task"], set(cached["mets"]), list(cached["failed"])
+
+ done = set(per_task) | set(failed)
+ for task in tasks:
+ if task.should_fail or task.id in done:
+ continue # a should-fail task defines no essentials; cached ones are skipped
+ task_model, task_mets, error = _build_task_model(base, task, name_to_id, comp_to_ids)
+ if error is not None:
+ failed.append(task.id)
+ else:
+ # One min-flux solve both proves feasibility and yields the essential-reaction
+ # candidates (the original reactions carrying flux in a sparse solution).
+ try:
+ fluxes = pfba(task_model).fluxes
+ candidates = [rid for rid in original_ids if abs(fluxes.get(rid, 0.0)) > tol]
+ task_metabolites |= task_mets
+ per_task[task.id] = _task_essential_reactions(task_model, candidates, tol)
+ except OptimizationError:
+ failed.append(task.id)
+ if cache_path is not None: # atomic checkpoint after each task
+ tmp = Path(f"{cache_path}.part")
+ pickle.dump({"per_task": per_task, "mets": task_metabolites, "failed": failed},
+ open(tmp, "wb"))
+ tmp.replace(cache_path)
+
+ # Majority direction; tie (sum == 0) → forward, as RAVEN's `pos < neg`.
+ direction_votes: dict[str, int] = {}
+ for essential in per_task.values():
+ for rxn_id, direction in essential.items():
+ direction_votes[rxn_id] = direction_votes.get(rxn_id, 0) + direction
+ reactions = {rid: (-1 if votes < 0 else 1) for rid, votes in direction_votes.items()}
+ return EssentialReactionsResult(reactions, per_task, task_metabolites, failed)
diff --git a/src/raven_python/tasks/tasklist.py b/src/raven_python/tasks/tasklist.py
new file mode 100644
index 0000000..5bdbcb0
--- /dev/null
+++ b/src/raven_python/tasks/tasklist.py
@@ -0,0 +1,141 @@
+"""Parse a metabolic task list.
+
+A task list defines, per task, allowed inputs/outputs, optional extra reactions
+(equations), reaction-bound changes, and whether the task *should fail*. Tasks
+are checked with :func:`raven_python.tasks.check_tasks`.
+
+The file is tab-delimited (``.txt``/``.tsv``) or Excel (``.xlsx``, sheet ``TASKS``;
+needs the ``[excel]`` extra). Recognised columns (the only required one is ``ID``):
+
+ ID · DESCRIPTION · IN · IN LB · IN UB · OUT · OUT LB · OUT UB ·
+ EQU · EQU LB · EQU UB · CHANGED RXN · CHANGED LB · CHANGED UB ·
+ SHOULD FAIL · PRINT FLUX · COMMENTS
+
+A task spans consecutive rows; only its first row carries an ID. Metabolites are
+written ``name[compartment]``; several in one cell are separated by ``;`` (sharing
+that row's bounds). ``IN``/``OUT`` default LB 0, UB 1000; ``EQU`` defaults LB
+-1000 if reversible (``<=>``) else 0, UB 1000. The special tokens ``ALLMETS`` and
+``ALLMETSIN[comp]`` allow free uptake/excretion of all metabolites (only the upper
+bound is used).
+"""
+from __future__ import annotations
+
+import csv
+import warnings
+from dataclasses import dataclass, field
+from pathlib import Path
+
+_COLUMNS = (
+ "ID", "DESCRIPTION", "IN", "IN LB", "IN UB", "OUT", "OUT LB", "OUT UB",
+ "EQU", "EQU LB", "EQU UB", "CHANGED RXN", "CHANGED LB", "CHANGED UB",
+ "SHOULD FAIL", "PRINT FLUX", "COMMENTS",
+)
+
+
+@dataclass
+class Task:
+ """One metabolic task. Bounds are ``(metabolite_or_reaction, lb, ub)`` triples."""
+
+ id: str
+ description: str = ""
+ should_fail: bool = False
+ print_fluxes: bool = False
+ comments: str = ""
+ inputs: list[tuple[str, float, float]] = field(default_factory=list)
+ outputs: list[tuple[str, float, float]] = field(default_factory=list)
+ equations: list[tuple[str, float, float]] = field(default_factory=list)
+ changed: list[tuple[str, float, float]] = field(default_factory=list)
+
+
+def _truthy(value: str) -> bool:
+ return value.strip().lower() not in ("", "0", "false", "no")
+
+
+def _num(value: str, default: float) -> float:
+ value = value.strip()
+ return float(value) if value else default
+
+
+def _read_rows(path: str | Path) -> list[list[str]]:
+ path = Path(path)
+ if path.suffix.lower() in (".xlsx", ".xlsm"):
+ try:
+ from openpyxl import load_workbook
+ except ImportError as exc: # pragma: no cover - optional dep
+ raise ImportError("Reading .xlsx task lists needs the '[excel]' extra (openpyxl).") from exc
+ wb = load_workbook(path, data_only=True)
+ if "TASKS" not in wb.sheetnames:
+ raise ValueError(
+ f"{path}: workbook has no sheet named 'TASKS' "
+ f"(found: {wb.sheetnames}). Rename the sheet or pick that file."
+ )
+ ws = wb["TASKS"]
+ return [["" if c is None else str(c) for c in row] for row in ws.iter_rows(values_only=True)]
+ with open(path, encoding="utf-8", newline="") as handle:
+ return [row for row in csv.reader(handle, delimiter="\t")]
+
+
+def parse_task_list(path: str | Path) -> list[Task]:
+ """Parse a task-list file into :class:`Task` objects."""
+ rows = _read_rows(path)
+ header_idx = next(
+ (i for i, r in enumerate(rows) if any(c.strip().upper() == "ID" for c in r)), None
+ )
+ if header_idx is None:
+ raise ValueError(f"{path}: no header row with an 'ID' column found.")
+ header = [c.strip().upper() for c in rows[header_idx]]
+ col = {name: header.index(name) for name in _COLUMNS if name in header}
+
+ def cell(row: list[str], name: str) -> str:
+ i = col.get(name)
+ return row[i].strip() if i is not None and i < len(row) else ""
+
+ # Columns whose presence on an ID-less row signals real continuation data
+ # (vs. pure whitespace/comment), used by the orphan-row warning below.
+ _DATA_COLS = ("IN", "OUT", "EQU", "CHANGED RXN")
+
+ tasks: list[Task] = []
+ current: Task | None = None
+ for row_no, row in enumerate(rows[header_idx + 1:], start=header_idx + 2):
+ if not any(c.strip() for c in row):
+ continue
+ rid = cell(row, "ID")
+ if rid.startswith("#"):
+ continue
+ if rid:
+ current = Task(
+ id=rid,
+ description=cell(row, "DESCRIPTION"),
+ should_fail=_truthy(cell(row, "SHOULD FAIL")),
+ print_fluxes=_truthy(cell(row, "PRINT FLUX")),
+ comments=cell(row, "COMMENTS"),
+ )
+ tasks.append(current)
+ if current is None:
+ # Continuation row appearing before any task ID: silently dropping it
+ # used to mask malformed task files. Warn (and skip) so the user sees it.
+ if any(cell(row, c) for c in _DATA_COLS):
+ warnings.warn(
+ f"{path}: row {row_no} carries task data but no task ID has "
+ "been seen yet; the row is being skipped.",
+ stacklevel=2,
+ )
+ continue
+ _add_row(current, row, cell)
+ return tasks
+
+
+def _add_row(task: Task, row: list[str], cell) -> None:
+ if inp := cell(row, "IN"):
+ lb, ub = _num(cell(row, "IN LB"), 0.0), _num(cell(row, "IN UB"), 1000.0)
+ task.inputs += [(m.strip(), lb, ub) for m in inp.split(";") if m.strip()]
+ if out := cell(row, "OUT"):
+ lb, ub = _num(cell(row, "OUT LB"), 0.0), _num(cell(row, "OUT UB"), 1000.0)
+ task.outputs += [(m.strip(), lb, ub) for m in out.split(";") if m.strip()]
+ if equ := cell(row, "EQU"):
+ lb = _num(cell(row, "EQU LB"), -1000.0 if "<=>" in equ else 0.0)
+ ub = _num(cell(row, "EQU UB"), 1000.0)
+ task.equations.append((equ.strip(), lb, ub))
+ if chg := cell(row, "CHANGED RXN"):
+ lb, ub = _num(cell(row, "CHANGED LB"), -1000.0), _num(cell(row, "CHANGED UB"), 1000.0)
+ task.changed += [(r.strip(), lb, ub) for r in chg.split(";") if r.strip()]
diff --git a/src/raven_python/utils/__init__.py b/src/raven_python/utils/__init__.py
new file mode 100644
index 0000000..7127bdd
--- /dev/null
+++ b/src/raven_python/utils/__init__.py
@@ -0,0 +1,16 @@
+"""Shared helpers — GPR linting, elemental balance, model curation checks, id sorting."""
+from raven_python.utils.balance import ElementalBalance, get_elemental_balance
+from raven_python.utils.gpr import GPRIssue, find_non_dnf_grrules, is_dnf
+from raven_python.utils.sort import sort_identifiers
+from raven_python.utils.validate import ModelIssue, check_model
+
+__all__ = [
+ "ElementalBalance",
+ "GPRIssue",
+ "ModelIssue",
+ "check_model",
+ "find_non_dnf_grrules",
+ "get_elemental_balance",
+ "is_dnf",
+ "sort_identifiers",
+]
diff --git a/src/raven_python/utils/balance.py b/src/raven_python/utils/balance.py
new file mode 100644
index 0000000..ee64ab4
--- /dev/null
+++ b/src/raven_python/utils/balance.py
@@ -0,0 +1,89 @@
+"""Check the elemental balance of reactions, distinguishing *unbalanced* from
+*unknown* (missing formula).
+
+cobra's ``reaction.check_mass_balance()`` silently treats a missing formula as
+empty, so a reaction can look "unbalanced" — or even balanced — when the truth is
+that the data is incomplete. This module checks for missing formulas first and
+returns a graded status
+per reaction (``balanced`` / ``unbalanced`` / ``unknown``) plus the element
+imbalance — over a batch, as structured data.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ElementalBalance:
+ """Balance result for one reaction.
+
+ Attributes
+ ----------
+ reaction_id
+ ID of the reaction.
+ status
+ ``"balanced"`` — elements balance;
+ ``"unbalanced"`` — they do not (see ``imbalance``);
+ ``"unknown"`` — at least one metabolite has no formula, so it cannot be
+ determined (cobra would silently miscount these).
+ imbalance
+ Element → net coefficient (products − reactants), only for
+ ``"unbalanced"``; empty otherwise. Charge is not included.
+ """
+
+ reaction_id: str
+ status: str
+ imbalance: dict[str, float] = field(default_factory=dict)
+
+
+def get_elemental_balance(
+ model: cobra.Model, reactions=None
+) -> list[ElementalBalance]:
+ """Check whether reactions are elementally balanced.
+ Parameters
+ ----------
+ reactions
+ Reaction IDs/objects to check; default all reactions. (Boundary
+ reactions exchange mass with the environment and will read as
+ ``unbalanced`` — filter them out if that is not wanted.)
+
+ Returns
+ -------
+ list of ElementalBalance
+ One entry per checked reaction, in model order.
+ """
+ if reactions is None:
+ rxns = list(model.reactions)
+ else:
+ if isinstance(reactions, (str, cobra.Reaction)):
+ reactions = [reactions]
+ rxns = [
+ r if isinstance(r, cobra.Reaction) else model.reactions.get_by_id(r)
+ for r in reactions
+ ]
+
+ results: list[ElementalBalance] = []
+ for rxn in rxns:
+ if not rxn.metabolites:
+ # A reaction with no metabolites used to fall through to ``balanced``
+ # (vacuously) because ``any()`` over the empty list is False and the
+ # zero-element imbalance dict is empty. Treat the no-formula case
+ # (zero formulae present) as ``unknown``: we can't determine balance
+ # for a reaction without stoichiometry.
+ results.append(ElementalBalance(rxn.id, "unknown"))
+ continue
+ if any(not met.formula for met in rxn.metabolites):
+ results.append(ElementalBalance(rxn.id, "unknown"))
+ continue
+ imbalance = {
+ element: amount
+ for element, amount in rxn.check_mass_balance().items()
+ if element != "charge"
+ }
+ if imbalance:
+ results.append(ElementalBalance(rxn.id, "unbalanced", imbalance))
+ else:
+ results.append(ElementalBalance(rxn.id, "balanced"))
+ return results
diff --git a/src/raven_python/utils/gpr.py b/src/raven_python/utils/gpr.py
new file mode 100644
index 0000000..2e2122d
--- /dev/null
+++ b/src/raven_python/utils/gpr.py
@@ -0,0 +1,119 @@
+"""GPR (gene-protein-reaction rule) linting.
+
+Flag GPRs that are *not* in disjunctive normal form ("OR of AND-complexes"), via cobra's
+GPR AST. GPR syntax *normalisation* is already done by cobra on assignment, so it isn't
+re-implemented here.
+
+Part (2) has no cobrapy equivalent and is ported here, reworked onto cobra's
+GPR AST instead of RAVEN's brittle substring search. The relevant property is
+**disjunctive normal form (DNF)**: an OR of AND-clauses of single genes, e.g.
+``(G1 and G2) or G3``. Rules where an AND contains an OR — e.g.
+``(G1 or G2) and (G3 or G4)`` — are *valid* for cobra but ambiguous for the
+isoenzyme/complex reasoning used across RAVEN/GECKO, and ``expand_model``
+(see :mod:`raven_python.manipulation.expand`) only does something for DNF rules.
+:func:`find_non_dnf_grrules` surfaces them as structured data rather than, as
+RAVEN did, only printing a warning.
+"""
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass
+
+import cobra
+from cobra.core.gene import GPR
+
+
+def _contains_or(node: ast.AST | None) -> bool:
+ """True if ``node``'s subtree contains an OR operator anywhere."""
+ if isinstance(node, ast.BoolOp):
+ if isinstance(node.op, ast.Or):
+ return True
+ return any(_contains_or(value) for value in node.values)
+ return False
+
+
+def _is_dnf_node(node: ast.AST | None) -> bool:
+ """True if the AST rooted at ``node`` is in disjunctive normal form.
+
+ DNF here means no AND operator has an OR anywhere beneath it, i.e. the
+ rule is a single gene, a pure AND-complex, or an OR of those.
+ """
+ if node is None or isinstance(node, ast.Name):
+ return True
+ if isinstance(node, ast.BoolOp):
+ if isinstance(node.op, ast.And):
+ return not any(_contains_or(value) for value in node.values)
+ # OR: every disjunct must itself be DNF
+ return all(_is_dnf_node(value) for value in node.values)
+ # Unknown node type: don't flag it as a problem.
+ return True
+
+
+def is_dnf(gpr: GPR | str | None) -> bool:
+ """Return whether a GPR is in disjunctive normal form (OR of AND-complexes).
+
+ Parameters
+ ----------
+ gpr
+ A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``.
+ An empty/``None`` rule is trivially DNF.
+
+ Examples
+ --------
+ >>> is_dnf("(G1 and G2) or G3")
+ True
+ >>> is_dnf("(G1 or G2) and G3")
+ False
+ """
+ if isinstance(gpr, str):
+ gpr = GPR.from_string(gpr)
+ if gpr is None:
+ return True
+ return _is_dnf_node(gpr.body)
+
+
+@dataclass(frozen=True)
+class GPRIssue:
+ """A reaction whose GPR is flagged by the linter.
+
+ Attributes
+ ----------
+ reaction_id
+ ID of the reaction.
+ gpr
+ The (already cobra-normalised) grRule string.
+ reason
+ Human-readable explanation of why it was flagged.
+ """
+
+ reaction_id: str
+ gpr: str
+ reason: str
+
+
+_NON_DNF_REASON = (
+ "GPR is not in disjunctive normal form (an AND clause contains an OR). "
+ "Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, "
+ 'e.g. rewrite "(G1 or G2) and (G3 or G4)" as '
+ '"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".'
+)
+
+
+def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]:
+ """Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes").
+
+ Uses cobra's GPR AST. Reactions with no GPR are skipped.
+
+ Returns
+ -------
+ list of GPRIssue
+ One entry per flagged reaction, in model reaction order. Empty if all
+ GPRs are simple OR-of-AND-complexes.
+ """
+ issues: list[GPRIssue] = []
+ for rxn in model.reactions:
+ if not rxn.gene_reaction_rule:
+ continue
+ if not is_dnf(rxn.gpr):
+ issues.append(GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON))
+ return issues
diff --git a/src/raven_python/utils/parse.py b/src/raven_python/utils/parse.py
new file mode 100644
index 0000000..8068f6c
--- /dev/null
+++ b/src/raven_python/utils/parse.py
@@ -0,0 +1,33 @@
+"""Small parsing helpers shared across raven_python."""
+from __future__ import annotations
+
+import re
+
+# A metabolite written as ``name[comp]``. The name is greedy so that, for a
+# pathological name that itself contains brackets, the *last* ``[...]`` is taken
+# as the compartment (matching RAVEN getIndexes' ``max(strfind('['))`` rule).
+_NAME_COMP_RE = re.compile(r"^(?P.+)\[(?P[^\[\]]+)\]$")
+
+
+def parse_name_comp(token: str) -> tuple[str, str | None]:
+ """Split a ``name[comp]`` token into ``(name, compartment)``.
+
+ This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``'
+ ``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written
+ as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``.
+
+ Returns ``(name, None)`` when there is no trailing ``[...]``.
+
+ Examples
+ --------
+ >>> parse_name_comp("ATP[c]")
+ ('ATP', 'c')
+ >>> parse_name_comp("ATP")
+ ('ATP', None)
+ >>> parse_name_comp("weird[name][m]")
+ ('weird[name]', 'm')
+ """
+ match = _NAME_COMP_RE.match(token.strip())
+ if match:
+ return match.group("name").strip(), match.group("comp").strip()
+ return token.strip(), None
diff --git a/src/raven_python/utils/sort.py b/src/raven_python/utils/sort.py
new file mode 100644
index 0000000..a8641a8
--- /dev/null
+++ b/src/raven_python/utils/sort.py
@@ -0,0 +1,21 @@
+"""Sort a model's identifiers alphabetically — useful for deterministic,
+diff-friendly output.
+
+cobra's ``DictList.sort`` reorders one list (and rebuilds its lookup index), but
+there is no single "sort the whole model" call; this provides it.
+"""
+from __future__ import annotations
+
+import cobra
+
+
+def sort_identifiers(model: cobra.Model) -> cobra.Model:
+ """Sort reactions, metabolites and genes alphabetically by ID, in place.
+
+ Returns the same (mutated) model for convenience. Compartments are a plain
+ dict and are emitted sorted by writers as needed.
+ """
+ model.reactions.sort(key=lambda r: r.id)
+ model.metabolites.sort(key=lambda m: m.id)
+ model.genes.sort(key=lambda g: g.id)
+ return model
diff --git a/src/raven_python/utils/validate.py b/src/raven_python/utils/validate.py
new file mode 100644
index 0000000..c08df48
--- /dev/null
+++ b/src/raven_python/utils/validate.py
@@ -0,0 +1,86 @@
+"""Curation checks for a model.
+
+A QC bundle cobra has no single call for: orphaned objects, empty reactions,
+duplicated metabolite ``name + compartment``, empty names, and objective sanity.
+:func:`check_model` returns these as structured :class:`ModelIssue` records.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cobra
+
+
+@dataclass(frozen=True)
+class ModelIssue:
+ """One curation issue found in a model.
+
+ Attributes
+ ----------
+ category
+ Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``,
+ ``"orphan_gene"``, ``"duplicate_name_compartment"``,
+ ``"empty_metabolite_name"``, ``"objective"``.
+ object_id
+ ID of the offending object, or ``None`` for model-level issues.
+ message
+ Human-readable description.
+ """
+
+ category: str
+ object_id: str | None
+ message: str
+
+
+def check_model(model: cobra.Model) -> list[ModelIssue]:
+ """Run curation checks on a model and return the issues found.
+
+ Does not
+ raise; returns a (possibly empty) list of :class:`ModelIssue`.
+ """
+ issues: list[ModelIssue] = []
+
+ for met in model.metabolites:
+ if not met.reactions:
+ issues.append(
+ ModelIssue("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.")
+ )
+ if not (met.name and str(met.name).strip()):
+ issues.append(
+ ModelIssue("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.")
+ )
+
+ for gene in model.genes:
+ if not gene.reactions:
+ issues.append(
+ ModelIssue("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.")
+ )
+
+ for rxn in model.reactions:
+ if not rxn.metabolites:
+ issues.append(
+ ModelIssue("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.")
+ )
+
+ by_name_comp: dict[tuple[str, str], list[str]] = {}
+ for met in model.metabolites:
+ by_name_comp.setdefault((met.name, met.compartment), []).append(met.id)
+ for (name, comp), ids in by_name_comp.items():
+ if name and len(ids) > 1:
+ issues.append(
+ ModelIssue(
+ "duplicate_name_compartment",
+ None,
+ f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}",
+ )
+ )
+
+ objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0]
+ if not objective_rxns:
+ issues.append(ModelIssue("objective", None, "No reaction has a nonzero objective coefficient."))
+ elif len(objective_rxns) > 1:
+ issues.append(
+ ModelIssue("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}")
+ )
+
+ return issues
diff --git a/tests/data/kegg_dump/compound b/tests/data/kegg_dump/compound
new file mode 100644
index 0000000..a78d176
--- /dev/null
+++ b/tests/data/kegg_dump/compound
@@ -0,0 +1,34 @@
+ENTRY C00001 Compound
+NAME H2O;
+ Water
+FORMULA H2O
+DBLINKS PubChem: 3303
+ ChEBI: 15377
+///
+ENTRY C00002 Compound
+NAME ATP
+FORMULA C10H16N5O13P3
+///
+ENTRY C00003 Compound
+NAME NAD+;
+ NAD
+FORMULA C21H28N7O14P2
+///
+ENTRY C00006 Compound
+NAME NADP+
+FORMULA C21H29N7O17P3
+///
+ENTRY C00031 Compound
+NAME D-Glucose;
+ Grape sugar
+FORMULA C6H12O6
+DBLINKS ChEBI: 4167 17634
+///
+ENTRY C01083 Compound
+NAME alpha,alpha-Trehalose
+FORMULA C12H22O11
+///
+ENTRY C00007 Compound
+NAME Oxygen
+FORMULA O2
+///
diff --git a/tests/data/kegg_dump/compound.inchi b/tests/data/kegg_dump/compound.inchi
new file mode 100644
index 0000000..448312f
--- /dev/null
+++ b/tests/data/kegg_dump/compound.inchi
@@ -0,0 +1 @@
+C00031 InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2
diff --git a/tests/data/kegg_dump/genes.pep b/tests/data/kegg_dump/genes.pep
new file mode 100644
index 0000000..f30073d
--- /dev/null
+++ b/tests/data/kegg_dump/genes.pep
@@ -0,0 +1,12 @@
+>bsu:BSU31050 gbsB; choline dehydrogenase
+MKVLAAGGTGYIGSHTVVELLEAGYDVVVLDNLSNGHREAVPKGVPFveqIDLRDREALDR
+>bsu:BSU31060 hypothetical protein
+MKVLAAGGTGYIGSHTVVELLEAGYDVVVLDNLSNGHREAVPKGVPFveqIDLRDREALDX
+>eco:b0001 thrA; aspartokinase
+MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDA
+>hsa:124 ADH1A; alcohol dehydrogenase 1A
+MSTAGKVIKCKAAVLWELKKPFSIEEVEVAPPKAHEVRIKMVATGICRSDDHVVSGTLVT
+>hsa:125 ADH1B; alcohol dehydrogenase 1B
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAHEVRIKMVATGICRSDDHVVSGTLVT
+>xxx:unused some other gene not in any KO
+MAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
diff --git a/tests/data/kegg_dump/ko b/tests/data/kegg_dump/ko
new file mode 100644
index 0000000..f6ae027
--- /dev/null
+++ b/tests/data/kegg_dump/ko
@@ -0,0 +1,14 @@
+ENTRY K01194 KO
+NAME treA, TREH
+DEFINITION alpha,alpha-trehalase [EC:3.2.1.28]
+GENES BSU: BSU31050(gbsB) BSU31060
+ HSA: 124 125(ADH)
+///
+ENTRY K00002 KO
+DEFINITION AKR1A1; alcohol dehydrogenase (NADP+) [EC:1.1.1.2]
+GENES ECO: b0001
+///
+ENTRY K99999 KO
+DEFINITION unlinked ortholog
+GENES ECO: b9999
+///
diff --git a/tests/data/kegg_dump/reaction b/tests/data/kegg_dump/reaction
new file mode 100644
index 0000000..b0e75c5
--- /dev/null
+++ b/tests/data/kegg_dump/reaction
@@ -0,0 +1,31 @@
+ENTRY R00010 Reaction
+NAME alpha,alpha-trehalose glucohydrolase
+DEFINITION alpha,alpha-Trehalose + H2O <=> 2 D-Glucose
+EQUATION C01083 + C00001 <=> 2 C00031
+ENZYME 3.2.1.28
+PATHWAY rn00500 Starch and sucrose metabolism
+ rn01100 Metabolic pathways
+MODULE M00599 example module
+ORTHOLOGY K01194 alpha,alpha-trehalase [EC:3.2.1.28]
+DBLINKS RHEA: 32678
+///
+ENTRY R00100 Reaction
+NAME spontaneous example
+COMMENT This reaction is spontaneous.
+EQUATION C00002 <=> C00003
+ORTHOLOGY K00002 some enzyme
+///
+ENTRY R00200 Reaction
+NAME undefined stoich example
+EQUATION C00001 + n C00002 <=> C00003
+///
+ENTRY R00300 Reaction
+NAME general example
+COMMENT General reaction.
+EQUATION C00031 <=> C00006
+ORTHOLOGY K09999 lumped ortholog
+///
+ENTRY R00400 Reaction
+NAME empty after cancellation
+EQUATION C00007 <=> C00007
+///
diff --git a/tests/data/kegg_dump/reaction_mapformula.lst b/tests/data/kegg_dump/reaction_mapformula.lst
new file mode 100644
index 0000000..0adb8f0
--- /dev/null
+++ b/tests/data/kegg_dump/reaction_mapformula.lst
@@ -0,0 +1,3 @@
+R00010: 00500: C01083 => C00031
+R00010: 00010: C00031 => C01083
+R00100: 00010: C00002 => C00003
diff --git a/tests/data/kegg_dump/taxonomy b/tests/data/kegg_dump/taxonomy
new file mode 100644
index 0000000..f0447e6
--- /dev/null
+++ b/tests/data/kegg_dump/taxonomy
@@ -0,0 +1,10 @@
+# Prokaryotes
+## Bacteria
+### Firmicutes
+T00010 bsu Bacillus subtilis 168 Bacillus
+### Gammaproteobacteria - Enterobacteria
+T00007 eco Escherichia coli K-12 MG1655 Escherichia
+# Eukaryotes
+## Animals
+### Vertebrates - Mammals
+T01001 hsa Homo sapiens (human) Homo
diff --git a/tests/test_binaries.py b/tests/test_binaries.py
new file mode 100644
index 0000000..d74ce0b
--- /dev/null
+++ b/tests/test_binaries.py
@@ -0,0 +1,80 @@
+"""Tests for raven_python.binaries (binary resolution + bundled-ZIP provisioning)."""
+import hashlib
+import shutil
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from raven_python import binaries
+
+
+def test_resolve_explicit_path():
+ assert binaries.resolve_binary("blastp", binary="/opt/x/blastp") == "/opt/x/blastp"
+
+
+def test_resolve_env_var(monkeypatch):
+ monkeypatch.setenv("RAVEN_PYTHON_DIAMOND", "/custom/diamond")
+ assert binaries.resolve_binary("diamond") == "/custom/diamond"
+
+
+@pytest.mark.skipif(not shutil.which("blastp"), reason="blastp not installed")
+def test_resolve_via_path():
+ assert binaries.resolve_binary("blastp") == shutil.which("blastp")
+
+
+def test_resolve_unresolvable_raises(monkeypatch):
+ monkeypatch.setattr(shutil, "which", lambda _: None)
+ with pytest.raises(FileNotFoundError, match="Could not find"):
+ binaries.resolve_binary("diamond") # empty registry, not on PATH
+
+
+def test_platform_key_format():
+ key = binaries.platform_key()
+ assert "-" in key
+ os_part, arch = key.split("-", 1)
+ assert os_part in {"linux", "macos", "windows"} or os_part # tolerant
+
+
+def test_ensure_binary_downloads_verifies_extracts(tmp_path, monkeypatch):
+ # Build a fake bundle ZIP containing an executable, served via file:// URL.
+ exe = tmp_path / "footool"
+ exe.write_text("#!/bin/sh\necho hi\n")
+ archive = tmp_path / "footool.zip"
+ with zipfile.ZipFile(archive, "w") as zf:
+ zf.write(exe, "footool")
+ sha = hashlib.sha256(archive.read_bytes()).hexdigest()
+
+ registry = {
+ "footool": {
+ "version": "1.0",
+ "provides": ["footool"],
+ "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": sha}},
+ }
+ }
+ monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache"))
+
+ path = binaries.ensure_binary("footool", registry=registry)
+ assert Path(path).exists()
+ assert Path(path).name == "footool"
+ # cached on second call (same path, no re-download needed)
+ assert binaries.ensure_binary("footool", registry=registry) == path
+
+
+def test_ensure_binary_sha_mismatch(tmp_path, monkeypatch):
+ archive = tmp_path / "x.zip"
+ with zipfile.ZipFile(archive, "w") as zf:
+ zf.writestr("footool", "data")
+ registry = {
+ "footool": {"version": "1", "provides": ["footool"],
+ "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": "deadbeef"}}}
+ }
+ monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache"))
+ with pytest.raises(ValueError, match="SHA256 mismatch"):
+ binaries.ensure_binary("footool", registry=registry)
+
+
+def test_ensure_binary_unhosted_platform_raises(tmp_path):
+ registry = {"footool": {"version": "1", "provides": ["footool"], "platforms": {}}}
+ with pytest.raises(FileNotFoundError, match="No bundled"):
+ binaries.ensure_binary("footool", registry=registry)
diff --git a/tests/test_change_grrules.py b/tests/test_change_grrules.py
new file mode 100644
index 0000000..d33f723
--- /dev/null
+++ b/tests/test_change_grrules.py
@@ -0,0 +1,49 @@
+"""Tests for change_gene_reaction_rules (changeGrRules port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, change_gene_reaction_rules
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"},
+ {"id": "R2", "equation": "a_c --> b_c"},
+ ],
+ )
+ return m
+
+
+def test_replace_rule_and_create_genes(model):
+ (rxn,) = change_gene_reaction_rules(model, {"R1": "G2 and G3"})
+ assert rxn.gene_reaction_rule == "G2 and G3"
+ assert {g.id for g in rxn.genes} == {"G2", "G3"}
+ assert {"G2", "G3"} <= {g.id for g in model.genes}
+
+
+def test_append_rule(model):
+ change_gene_reaction_rules(model, {"R1": "G4"}, replace=False)
+ # (G1) or (G4), normalised by cobra
+ assert model.reactions.get_by_id("R1").gene_reaction_rule == "G1 or G4"
+
+
+def test_append_when_empty_is_just_new(model):
+ change_gene_reaction_rules(model, {"R2": "G5"}, replace=False)
+ assert model.reactions.get_by_id("R2").gene_reaction_rule == "G5"
+
+
+def test_batch(model):
+ changed = change_gene_reaction_rules(model, {"R1": "GA", "R2": "GB"})
+ assert [r.id for r in changed] == ["R1", "R2"]
+
+
+def test_unknown_reaction_errors(model):
+ with pytest.raises(ValueError, match="not found"):
+ change_gene_reaction_rules(model, {"NOPE": "G1"})
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..714c3a9
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,89 @@
+"""Tests for ensure_data (data.py). Uses file:// URLs to avoid the network."""
+import hashlib
+
+import pytest
+
+from raven_python.data import (
+ CORE_KEGG_FILES,
+ ensure_data_file,
+ ensure_kegg_data,
+)
+
+
+def _sha256(data: bytes) -> str:
+ return hashlib.sha256(data).hexdigest()
+
+
+@pytest.fixture
+def served(tmp_path, monkeypatch):
+ """A fake registry served from local files, with the cache pointed at tmp."""
+ src = tmp_path / "src"
+ src.mkdir()
+ payloads = {
+ "reference_model.yml.gz": b"!!omap model bytes",
+ "ko_reaction.tsv.gz": b"ko\treaction\n",
+ "ko_names.tsv.gz": b"ko\tname\n",
+ "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n",
+ "rxn_flags.tsv.gz": b"reaction\tspontaneous\n",
+ }
+ files = {}
+ for name, data in payloads.items():
+ path = src / name
+ path.write_bytes(data)
+ files[name] = {"url": path.as_uri(), "sha256": _sha256(data)}
+ registry = {"kegg": {"version": "v1", "files": files}}
+
+ cache = tmp_path / "cache"
+ monkeypatch.setenv("XDG_CACHE_HOME", str(cache))
+ return registry, cache, payloads
+
+
+def test_ensure_data_file_downloads_and_caches(served):
+ registry, cache, payloads = served
+ path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry)
+ assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz"
+ assert path.read_bytes() == payloads["ko_reaction.tsv.gz"]
+
+
+def test_ensure_data_file_reuses_cache(served, monkeypatch):
+ registry, _, _ = served
+ first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+ # Break the URL: a second call must hit the cache, not re-download.
+ registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent"
+ second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+ assert first == second and second.exists()
+
+
+def test_sha256_mismatch_rejected(served):
+ registry, cache, _ = served
+ registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64
+ with pytest.raises(ValueError, match="SHA256 mismatch"):
+ ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry)
+ # The corrupt partial download must not be left behind.
+ assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists()
+
+
+def test_unknown_dataset_actionable_error(served):
+ registry, _, _ = served
+ with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
+ ensure_data_file("metacyc", "x", registry=registry)
+
+
+def test_unknown_file_lists_available(served):
+ registry, _, _ = served
+ with pytest.raises(FileNotFoundError, match="not registered"):
+ ensure_data_file("kegg", "missing.tsv.gz", registry=registry)
+
+
+def test_ensure_kegg_data_fetches_core_set(served):
+ registry, cache, _ = served
+ out = ensure_kegg_data(registry=registry)
+ assert out == cache / "raven_python" / "data" / "kegg-v1"
+ for name in CORE_KEGG_FILES:
+ assert (out / name).is_file()
+
+
+def test_empty_registry_raises():
+ # The shipped registry is empty until artefacts are published.
+ with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
+ ensure_data_file("kegg", "ko_reaction.tsv.gz")
diff --git a/tests/test_gapfilling.py b/tests/test_gapfilling.py
new file mode 100644
index 0000000..b92e982
--- /dev/null
+++ b/tests/test_gapfilling.py
@@ -0,0 +1,109 @@
+"""Tests for connectivity gap-filling (gapfilling/fill.py, Phase 4b)."""
+import cobra
+import pytest
+
+from raven_python.gapfilling import GapFillResult, connect_blocked_reactions
+
+
+def _met(mid):
+ return cobra.Metabolite(mid, name=mid, compartment="c")
+
+
+@pytest.fixture
+def draft_and_template():
+ """Draft: EX_A -> A -> B (r1), but B has no consumer, so r1 is blocked.
+
+ Template supplies B -> C (r2) and an exchange for C, which unblocks r1.
+ """
+ A, B = _met("A_c"), _met("B_c")
+ draft = cobra.Model("draft")
+ exa = cobra.Reaction("EX_A", lower_bound=-10, upper_bound=1000)
+ exa.add_metabolites({A: 1})
+ r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) # A -> B, irreversible
+ r1.add_metabolites({A: -1, B: 1})
+ draft.add_reactions([exa, r1])
+
+ template = cobra.Model("template")
+ r2 = cobra.Reaction("r2", lower_bound=0, upper_bound=1000) # B -> C
+ r2.add_metabolites({_met("B_c"): -1, _met("C_c"): 1})
+ exc = cobra.Reaction("EX_C", lower_bound=-1000, upper_bound=1000)
+ exc.add_metabolites({_met("C_c"): -1})
+ extra = cobra.Reaction("r_unneeded", lower_bound=0, upper_bound=1000) # D -> E, irrelevant
+ extra.add_metabolites({_met("D_c"): -1, _met("E_c"): 1})
+ template.add_reactions([r2, exc, extra])
+ return draft, template
+
+
+# --------------------------------------------------------------------------- #
+# Connectivity gap-fill
+# --------------------------------------------------------------------------- #
+def test_fill_gaps_connects_blocked_reaction(draft_and_template):
+ draft, template = draft_and_template
+ assert "r1" in cobra.flux_analysis.find_blocked_reactions(draft) # precondition
+
+ res = connect_blocked_reactions(draft, template)
+ assert isinstance(res, GapFillResult)
+ assert "r1" in res.newly_connected
+ assert set(res.added_reactions) == {"r2", "EX_C"} # both needed to drain B
+ assert "r_unneeded" not in res.added_reactions # irrelevant template rxn not added
+
+
+def test_fill_gaps_returns_working_model_that_unblocks(draft_and_template):
+ draft, template = draft_and_template
+ res = connect_blocked_reactions(draft, template)
+ assert {"r2", "EX_C"} <= {r.id for r in res.model.reactions}
+ assert "r1" not in cobra.flux_analysis.find_blocked_reactions(res.model)
+ # original draft is untouched
+ assert "r2" not in {r.id for r in draft.reactions}
+
+
+def test_fill_gaps_nothing_to_do_when_unblocked(draft_and_template):
+ draft, template = draft_and_template
+ # give the draft its own drain so r1 is not blocked
+ drain = cobra.Reaction("EX_B", lower_bound=-1000, upper_bound=1000)
+ drain.add_metabolites({draft.metabolites.B_c: -1})
+ draft.add_reactions([drain])
+ res = connect_blocked_reactions(draft, template)
+ assert res.added_reactions == []
+ assert res.newly_connected == []
+
+
+def test_fill_gaps_scores_prefer_higher_scored_reactions():
+ # Two alternative single-reaction drains for B; scores should pick the preferred one.
+ A, B = _met("A_c"), _met("B_c")
+ draft = cobra.Model("draft")
+ exa = cobra.Reaction("EX_A", lower_bound=-10, upper_bound=1000)
+ exa.add_metabolites({A: 1})
+ r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+ r1.add_metabolites({A: -1, B: 1})
+ draft.add_reactions([exa, r1])
+ template = cobra.Model("t")
+ d1 = cobra.Reaction("drain1", lower_bound=-1000, upper_bound=1000)
+ d1.add_metabolites({_met("B_c"): -1})
+ d2 = cobra.Reaction("drain2", lower_bound=-1000, upper_bound=1000)
+ d2.add_metabolites({_met("B_c"): -1})
+ template.add_reactions([d1, d2])
+ # Scores are penalties (higher = preferred = cheaper to include); only one drain
+ # is needed, so the less-penalised drain1 is chosen.
+ res = connect_blocked_reactions(draft, template, scores={"drain1": -1.0, "drain2": -5.0})
+ assert res.added_reactions == ["drain1"]
+
+
+def test_unconnectable_reaction_reported_not_added():
+ # A blocked irreversible reaction that no template can connect: reported, no adds.
+ A, B = _met("A_c"), _met("B_c")
+ draft = cobra.Model("draft")
+ exa = cobra.Reaction("EX_A", lower_bound=-10, upper_bound=1000)
+ exa.add_metabolites({A: 1})
+ r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) # A -> B, B has no drain
+ r1.add_metabolites({A: -1, B: 1})
+ draft.add_reactions([exa, r1])
+ template = cobra.Model("t") # offers nothing that can drain B
+ noise = cobra.Reaction("noise", lower_bound=0, upper_bound=1000)
+ noise.add_metabolites({_met("X_c"): -1, _met("Y_c"): 1})
+ template.add_reactions([noise])
+
+ res = connect_blocked_reactions(draft, template)
+ assert res.added_reactions == []
+ assert res.newly_connected == []
+ assert "r1" in res.cannot_connect
diff --git a/tests/test_init.py b/tests/test_init.py
new file mode 100644
index 0000000..a61ee19
--- /dev/null
+++ b/tests/test_init.py
@@ -0,0 +1,110 @@
+"""Tests for the INIT MILP (init/init.py, Phase 4c)."""
+import cobra
+import pytest
+
+from raven_python.init import InitResult, run_init
+
+
+def _met(mid):
+ return cobra.Metabolite(mid, name=mid[:-2] if mid.endswith("_c") else mid, compartment="c")
+
+
+@pytest.fixture
+def model():
+ """EX_A -> A -(r1)-> B -(r2)-> C -(r3)-> D, with A uptake and excretion allowed.
+
+ r1, r2 are good (positive score); r3 is bad (negative score).
+ """
+ m = cobra.Model("net")
+ A, B, C, D = _met("A_c"), _met("B_c"), _met("C_c"), _met("D_c")
+ m.add_metabolites([A, B, C, D])
+ exa = cobra.Reaction("EX_A", lower_bound=-1000, upper_bound=1000)
+ exa.add_metabolites({A: -1}) # negative flux = uptake of A
+ r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+ r1.add_metabolites({A: -1, B: 1})
+ r2 = cobra.Reaction("r2", lower_bound=0, upper_bound=1000)
+ r2.add_metabolites({B: -1, C: 1})
+ r3 = cobra.Reaction("r3", lower_bound=0, upper_bound=1000)
+ r3.add_metabolites({C: -1, D: 1})
+ m.add_reactions([exa, r1, r2, r3])
+ return m
+
+
+def test_keeps_positive_drops_negative(model):
+ scores = {"r1": 1.0, "r2": 1.0, "r3": -1.0}
+ res = run_init(model, scores, prod_weight=0.0, allow_excretion=True)
+ assert isinstance(res, InitResult)
+ kept = {r.id for r in res.model.reactions}
+ assert {"r1", "r2"} <= kept # positive-score, flux-consistent -> kept
+ assert "r3" in res.deleted_reactions # negative score -> removed
+ assert "r3" not in kept
+
+
+def test_negative_scores_emptied_when_no_reward(model):
+ # All reactions negative and no production reward -> keep nothing (empty optimum).
+ scores = {r.id: -1.0 for r in model.reactions}
+ res = run_init(model, scores, prod_weight=0.0, allow_excretion=True)
+ assert res.deleted_reactions == sorted(r.id for r in model.reactions)
+ assert len(res.model.reactions) == 0
+
+
+def test_essential_reaction_forced_kept(model):
+ # r3 is negative-scored but essential -> must be kept despite the penalty.
+ scores = {"r1": 1.0, "r2": 1.0, "r3": -1.0}
+ res = run_init(model, scores, essential_rxns=["r3"], prod_weight=0.0, allow_excretion=True)
+ kept = {r.id for r in res.model.reactions}
+ assert "r3" in kept
+ assert "r3" not in res.deleted_reactions
+
+
+def test_prod_weight_pulls_in_connectivity(model):
+ # With everything scored 0, no reward -> empty. With prod_weight>0, producing
+ # metabolites is rewarded, so flux-carrying reactions are pulled in.
+ zero = {r.id: 0.0 for r in model.reactions}
+ empty = run_init(model, zero, prod_weight=0.0, allow_excretion=True)
+ assert len(empty.model.reactions) == 0
+ pulled = run_init(model, zero, prod_weight=0.5, allow_excretion=True)
+ assert len(pulled.model.reactions) > 0
+
+
+def test_present_mets_reports_producibility(model):
+ scores = {"r1": 1.0, "r2": 1.0}
+ res = run_init(
+ model, scores, present_mets=["C", "Z"], prod_weight=0.0, allow_excretion=True
+ )
+ assert res.met_production["C"] is True # A->B->C is producible
+ assert res.met_production["Z"] is False # not in the model
+
+
+def test_objective_returned(model):
+ res = run_init(model, {"r1": 1.0, "r2": 1.0, "r3": -1.0}, prod_weight=0.0, allow_excretion=True)
+ assert res.objective == pytest.approx(2.0) # kept r1(+1) + r2(+1), dropped r3
+
+
+def test_reversible_essential_keeps_productive_path():
+ """A reversible essential reaction must not be forced into a phantom fwd+rev loop.
+
+ SRC -> a, R: a <=> b (reversible, essential), SNK: b ->. Forcing R essential
+ should keep the productive path SRC->R->SNK, not delete SRC/SNK and leave R
+ self-looping (the bug from forcing eps flux through both split directions).
+ """
+ import cobra
+
+ m = cobra.Model("revess")
+ a, b = (cobra.Metabolite(x, compartment="c") for x in "ab")
+ m.add_metabolites([a, b])
+ src = cobra.Reaction("SRC", lower_bound=0, upper_bound=1000)
+ src.add_metabolites({a: 1})
+ r = cobra.Reaction("R", lower_bound=-1000, upper_bound=1000)
+ r.add_metabolites({a: -1, b: 1})
+ snk = cobra.Reaction("SNK", lower_bound=0, upper_bound=1000)
+ snk.add_metabolites({b: -1})
+ m.add_reactions([src, r, snk])
+ m.objective = "SNK"
+
+ res = run_init(m, {"SRC": -1.0, "SNK": -1.0}, essential_rxns=["R"], prod_weight=0.0)
+ kept = {rxn.id for rxn in res.model.reactions}
+ assert "R" in kept
+ # The productive path must be kept (SRC feeds R, SNK drains it); R can't self-loop.
+ assert {"SRC", "SNK"} <= kept
+ assert res.model.slim_optimize() > 1e-6 # the kept model actually carries flux
diff --git a/tests/test_init_build.py b/tests/test_init_build.py
new file mode 100644
index 0000000..cbc566f
--- /dev/null
+++ b/tests/test_init_build.py
@@ -0,0 +1,132 @@
+"""Tests for tINIT scoring + get_init_model (init/score.py, init/build.py)."""
+import math
+
+import cobra
+import pytest
+
+from raven_python.init import (
+ InitModelResult,
+ gene_scores_from_expression,
+ get_init_model,
+ score_reactions_from_genes,
+)
+
+
+# --------------------------------------------------------------------------- #
+# score_reactions_from_genes
+# --------------------------------------------------------------------------- #
+@pytest.fixture
+def gpr_model():
+ m = cobra.Model("g")
+ a = cobra.Metabolite("a_c", compartment="c")
+ b = cobra.Metabolite("b_c", compartment="c")
+ m.add_metabolites([a, b])
+ r_complex = cobra.Reaction("r_complex") # (g1 and g2) or g3
+ r_complex.add_metabolites({a: -1, b: 1})
+ m.add_reactions([r_complex])
+ r_complex.gene_reaction_rule = "(g1 and g2) or g3"
+ r_nogene = cobra.Reaction("r_nogene")
+ r_nogene.add_metabolites({b: -1})
+ m.add_reactions([r_nogene])
+ return m
+
+
+def test_score_isozyme_max_complex_min(gpr_model):
+ # (g1 and g2) or g3 -> max(min(1, 4), 3) = max(1, 3) = 3
+ scores = score_reactions_from_genes(gpr_model, {"g1": 1.0, "g2": 4.0, "g3": 3.0})
+ assert scores["r_complex"] == 3.0
+
+
+def test_score_no_gene_reaction_gets_default(gpr_model):
+ scores = score_reactions_from_genes(gpr_model, {"g1": 1, "g2": 1, "g3": 1}, no_gene_score=-2.0)
+ assert scores["r_nogene"] == -2.0
+
+
+def test_score_missing_genes_omitted(gpr_model):
+ # g2 missing -> complex (g1 and g2) collapses to g1=1; OR with g3=3 -> max(1,3)=3
+ scores = score_reactions_from_genes(gpr_model, {"g1": 1.0, "g3": 3.0})
+ assert scores["r_complex"] == 3.0
+ # all genes missing -> no_gene_score
+ assert score_reactions_from_genes(gpr_model, {})["r_complex"] == -2.0
+
+
+def test_score_invalid_method(gpr_model):
+ with pytest.raises(ValueError, match="isozyme_scoring"):
+ score_reactions_from_genes(gpr_model, {}, isozyme_scoring="nonsense")
+
+
+# --------------------------------------------------------------------------- #
+# gene_scores_from_expression (RNA-seq path)
+# --------------------------------------------------------------------------- #
+def test_expression_scores_sign_and_clamp():
+ expr = {"hi": 100.0, "lo": 1.0, "mid": 10.0, "zero": 0.0}
+ ref = 10.0 # threshold/reference
+ s = gene_scores_from_expression(expr, ref)
+ assert s["hi"] == pytest.approx(min(5 * math.log(10), 10.0)) # above ref -> positive
+ assert s["lo"] == pytest.approx(max(5 * math.log(0.1), -5.0)) # below ref -> negative
+ assert s["mid"] == pytest.approx(0.0) # at ref -> 0
+ assert s["zero"] == -5.0 # non-positive -> floor
+
+
+def test_expression_per_gene_reference():
+ expr = {"g": 20.0}
+ s = gene_scores_from_expression(expr, {"g": 5.0})
+ assert s["g"] == pytest.approx(5 * math.log(4))
+
+
+# --------------------------------------------------------------------------- #
+# get_init_model pipeline
+# --------------------------------------------------------------------------- #
+@pytest.fixture
+def model():
+ m = cobra.Model("net")
+ A, B, C, D = (cobra.Metabolite(x, name=x[:-2], compartment="c") for x in ("A_c", "B_c", "C_c", "D_c"))
+ m.add_metabolites([A, B, C, D])
+ exa = cobra.Reaction("EX_A", lower_bound=-1000, upper_bound=1000)
+ exa.add_metabolites({A: -1})
+ r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+ r1.add_metabolites({A: -1, B: 1})
+ r2 = cobra.Reaction("r2", lower_bound=0, upper_bound=1000)
+ r2.add_metabolites({B: -1, C: 1})
+ r3 = cobra.Reaction("r3", lower_bound=0, upper_bound=1000)
+ r3.add_metabolites({C: -1, D: 1})
+ m.add_reactions([exa, r1, r2, r3])
+ for r, rule in (("r1", "g1"), ("r2", "g2"), ("r3", "g3")):
+ m.reactions.get_by_id(r).gene_reaction_rule = rule
+ return m
+
+
+def test_get_init_model_from_gene_scores(model):
+ # g1,g2 expressed (positive), g3 not (negative) -> keep r1,r2, drop r3.
+ res = get_init_model(model, gene_scores={"g1": 5.0, "g2": 5.0, "g3": -5.0}, prod_weight=0.0)
+ assert isinstance(res, InitModelResult)
+ kept = {r.id for r in res.model.reactions}
+ assert {"r1", "r2"} <= kept
+ assert "r3" not in kept
+ assert res.reaction_scores["r1"] == 5.0
+
+
+def test_get_init_model_requires_one_score_source(model):
+ with pytest.raises(ValueError, match="exactly one"):
+ get_init_model(model)
+ with pytest.raises(ValueError, match="exactly one"):
+ get_init_model(model, rxn_scores={}, gene_scores={})
+
+
+def test_get_init_model_essential_kept(model):
+ # r3 negative-scored but essential -> kept.
+ res = get_init_model(
+ model, rxn_scores={"r1": 1, "r2": 1, "r3": -1}, essential_rxns=["r3"], prod_weight=0.0
+ )
+ assert "r3" in {r.id for r in res.model.reactions}
+
+
+def test_get_init_model_removes_dead_ends(model):
+ # An isolated reaction that can never carry flux is dropped as a dead end.
+ X, Y = cobra.Metabolite("X_c", compartment="c"), cobra.Metabolite("Y_c", compartment="c")
+ dead = cobra.Reaction("dead", lower_bound=0, upper_bound=1000)
+ dead.add_metabolites({X: -1, Y: 1}) # X has no source, Y no sink (no exchange)
+ model.add_reactions([dead])
+ res = get_init_model(model, rxn_scores={"r1": 1, "r2": 1}, prod_weight=0.0)
+ assert "dead" in res.deleted_dead_end_reactions
+ assert "dead" not in {r.id for r in res.model.reactions}
diff --git a/tests/test_init_ftinit.py b/tests/test_init_ftinit.py
new file mode 100644
index 0000000..58f0542
--- /dev/null
+++ b/tests/test_init_ftinit.py
@@ -0,0 +1,139 @@
+"""Phase 4d.3: the single-step ftINIT MILP (run_ftinit).
+
+Validated on the testModel oracle against (a) a hand-checked score-optimal solution,
+(b) the formulation invariants, and (c) exact agreement with the already-tested
+run_init. The full-pipeline RAVEN outputs (tinitTests T0001/T0002) additionally
+involve linear merge + the toIgnore masks + staging + exchange re-adding, layered on
+in 4d.2/4d.3b/4d.5.
+
+Note on the toy result: with strict mass balance and no metabolite-production reward
+(ftINIT, unlike classic INIT, only rewards metabolomics-detected mets), the
+score-optimal subnetwork on testModel is the internal cycle R4→R6→(R10 rev)→(R9 rev),
+worth 7+0.5-3+3.5 = 8.0 — it beats the "honest" exchange path because that path must
+pay for the negative-score transport reactions R2/R7. The bare INIT MILP has no
+loopless constraint (neither does RAVEN's); loop-free models come from the staged
+pipeline + exchange handling and, at genome scale, from models having real exchanges
+so such cycles are not score-optimal. This faithfully matches RAVEN's MILP.
+"""
+import cobra
+import pytest
+from tinit_oracles import TEST_MODEL_SCORES, expr_for_rxn_score, make_test_model
+
+from raven_python.init import FtInitResult, run_ftinit, run_init
+from raven_python.init.score import gene_scores_from_expression, score_reactions_from_genes
+
+_LOOP = {"R4", "R6", "R9", "R10"} # the score-optimal subnetwork (8.0)
+
+
+def _scores(model):
+ expr = expr_for_rxn_score(TEST_MODEL_SCORES)
+ return score_reactions_from_genes(model, gene_scores_from_expression(expr, 1.0))
+
+
+def test_full_milp_score_optimum():
+ model = make_test_model()
+ res = run_ftinit(model, _scores(model))
+ assert isinstance(res, FtInitResult)
+ assert set(res.kept_reactions) == _LOOP
+ assert res.deleted_reactions == ["R1", "R2", "R3", "R5", "R7", "R8"]
+ assert res.objective == pytest.approx(8.0, abs=1e-6)
+
+
+def test_kept_reactions_carry_flux_and_balance():
+ """Indicator-on reactions carry flux (≥ force_on) and the solution is steady-state."""
+ model = make_test_model()
+ res = run_ftinit(model, _scores(model))
+ for rid in res.kept_reactions:
+ assert abs(res.fluxes[rid]) > 1e-9
+ # The extracted model is itself feasible/flux-consistent.
+ assert res.model.slim_optimize() is not None
+
+
+def test_agrees_with_run_init():
+ """Exact agreement with the classic INIT MILP (no production reward, no rev loops).
+
+ run_init splits reversibles and double-scores both directions unless no_rev_loops,
+ so we compare under matching settings: same objective and same kept set.
+ """
+ model = make_test_model()
+ scores = _scores(model)
+ ft = run_ftinit(model, scores)
+ init = run_init(model, scores, prod_weight=0.0, eps=0.1, no_rev_loops=True)
+ assert set(ft.kept_reactions) == {r.id for r in init.model.reactions}
+ assert ft.objective == pytest.approx(init.objective, abs=1e-6)
+
+
+def test_essential_force_clamps_to_capacity():
+ """Forcing an essential reaction is clamped to its capacity (no lb>ub crash).
+
+ A reaction capped at 0.05 forced with the default 0.1 must not error; it is forced
+ to its capacity (0.05) and the model stays feasible. A per-reaction force of 0.04
+ forces exactly that.
+ """
+ m = cobra.Model("cap")
+ a, b = (cobra.Metabolite(x, compartment="s") for x in "ab")
+ m.add_metabolites([a, b])
+ r = cobra.Reaction("LOW", lower_bound=0, upper_bound=0.05) # tiny capacity
+ r.add_metabolites({a: -1, b: 1})
+ for mid, st in [("EX_a", {a: -1}), ("EX_b", {b: -1})]:
+ ex = cobra.Reaction(mid, lower_bound=-1000, upper_bound=1000)
+ ex.add_metabolites(st)
+ m.add_reactions([ex])
+ m.add_reactions([r])
+ m.objective = "LOW"
+
+ res = run_ftinit(m, {}, essential_rxns=["LOW"], force_on_ess=0.1) # clamped to 0.05
+ assert res.fluxes["LOW"] >= 0.05 - 1e-9
+ res2 = run_ftinit(m, {}, essential_rxns=["LOW"], essential_force={"LOW": 0.04})
+ assert res2.fluxes["LOW"] >= 0.04 - 1e-9
+
+
+def test_essential_reaction_forced_on():
+ """An essential reaction is kept and carries flux even when its score is negative."""
+ model = make_test_model()
+ res = run_ftinit(model, _scores(model), essential_rxns=["R3"])
+ assert "R3" in res.kept_reactions
+ assert abs(res.fluxes["R3"]) > 1e-6
+
+
+def test_rem_pos_rev_drops_positive_reversibles():
+ """rem_pos_rev frees positive reversibles (score→0): the score-8.0 loop collapses.
+
+ R4 (+7) and R10 (+3.5) are positive reversibles; with them unscored, the cycle is
+ no longer profitable (R6 0.5 - R9 3 < 0), so nothing scored stays on.
+ """
+ model = make_test_model()
+ res = run_ftinit(model, _scores(model), rem_pos_rev=True)
+ assert res.objective == pytest.approx(0.0, abs=1e-6)
+ assert "R6" not in res.kept_reactions and "R9" not in res.kept_reactions
+
+
+def test_allow_excretion_relaxes_balance():
+ """With allow_excretion the result stays feasible (net production permitted)."""
+ model = make_test_model()
+ res = run_ftinit(model, _scores(model), allow_excretion=True)
+ assert res.objective >= 8.0 - 1e-6 # at least as good as strict balance
+
+
+def test_unscored_reactions_are_kept_free():
+ """Score-0 reactions are left in the model (not removable), not deleted."""
+ model = make_test_model()
+ scores = _scores(model)
+ scores["R3"] = 0.0 # make R3 unscored -> must not be deleted
+ res = run_ftinit(model, scores)
+ assert "R3" not in res.deleted_reactions
+
+
+def test_forced_flux_lower_bound_is_respected():
+ """A scored, non-reversible reaction with lb>0 must keep carrying >= lb flux.
+
+ Guards the bound handling: the single-direction branch must use the model's own
+ [lb, ub], not zero out a positive lower bound.
+ """
+ model = make_test_model()
+ scores = _scores(model)
+ # R6 (2 d[c] => e[c]) is forward-irreversible; force >=2 flux through it.
+ model.reactions.get_by_id("R6").lower_bound = 2.0
+ res = run_ftinit(model, scores)
+ assert res.fluxes["R6"] >= 2.0 - 1e-6
+ assert "R6" not in res.deleted_reactions
diff --git a/tests/test_init_genes.py b/tests/test_init_genes.py
new file mode 100644
index 0000000..862ca17
--- /dev/null
+++ b/tests/test_init_genes.py
@@ -0,0 +1,71 @@
+"""Phase 4d.5: remove_low_score_genes — the three RAVEN docstring oracle cases.
+
+Scores use distinct values to avoid the random tie-break RAVEN mentions when all
+isozyme alternatives are negative.
+"""
+import cobra
+
+from raven_python.init import remove_low_score_genes
+
+
+def _model(rule: str) -> cobra.Model:
+ m = cobra.Model("g")
+ a = cobra.Metabolite("a", compartment="c")
+ b = cobra.Metabolite("b", compartment="c")
+ r = cobra.Reaction("R", lower_bound=0, upper_bound=1000)
+ r.add_metabolites({a: -1, b: 1})
+ m.add_reactions([r])
+ r.gene_reaction_rule = rule
+ return m
+
+
+def _norm(rule: str) -> str:
+ """cobra's normalized form of a GPR string, for order/paren-insensitive comparison."""
+ return _model(rule).reactions.R.gene_reaction_rule
+
+
+def _result(rule: str, scores: dict) -> str:
+ out, _ = remove_low_score_genes(_model(rule), scores)
+ return out.reactions.R.gene_reaction_rule
+
+
+def test_case1_isozyme_vs_complex():
+ """G1 or (G2 and G3 and G4); G1,G2 negative → keep the complex."""
+ # G1 more negative than G2 so the complex (= G2's score under min) is least-negative.
+ scores = {"G1": -2.0, "G2": -1.0, "G3": 1.0, "G4": 1.0}
+ assert _result("G1 or (G2 and G3 and G4)", scores) == _norm("G2 and G3 and G4")
+
+
+def test_case2_two_complexes():
+ """G1 or (G2 and G3) or (G4 and G5); G1,G2 negative → keep the positive complex."""
+ scores = {"G1": -1.0, "G2": -1.0, "G3": 1.0, "G4": 1.0, "G5": 1.0}
+ assert _result("G1 or (G2 and G3) or (G4 and G5)", scores) == _norm("G4 and G5")
+
+
+def test_case3_nested_isozyme_in_complex():
+ """(G1 and (G2 or G3) and G4); G2 negative → prune G2 from the inner isozyme group."""
+ scores = {"G1": 1.0, "G2": -1.0, "G3": 1.0, "G4": 1.0}
+ assert _result("G1 and (G2 or G3) and G4", scores) == _norm("G1 and G3 and G4")
+
+
+def test_complex_subunit_not_removed_individually():
+ """A negative subunit of a pure complex stays (the whole complex is kept)."""
+ scores = {"G1": 1.0, "G2": -1.0}
+ assert _result("G1 and G2", scores) == _norm("G1 and G2")
+
+
+def test_single_negative_gene_kept():
+ """A reaction's only gene is never removed (≥1 must remain)."""
+ assert _result("G1", {"G1": -5.0}) == "G1"
+
+
+def test_unscored_genes_not_removed():
+ """Genes absent from the score map are treated as unscored and not removed."""
+ scores = {"G1": -1.0} # G2 unscored
+ assert _result("G1 or G2", scores) == _norm("G2") # only the negative G1 dropped
+
+
+def test_removed_genes_reported_and_pruned():
+ out, removed = remove_low_score_genes(_model("G1 or G2"), {"G1": -1.0, "G2": 1.0})
+ assert removed == ["G1"]
+ assert "G1" not in {g.id for g in out.genes}
diff --git a/tests/test_init_merge.py b/tests/test_init_merge.py
new file mode 100644
index 0000000..f6fea8a
--- /dev/null
+++ b/tests/test_init_merge.py
@@ -0,0 +1,109 @@
+"""Phase 4d.2: linear reaction merging (merge_linear + group_rxn_scores).
+
+Oracles: RAVEN tinitTests T0004. testModel merges {R1,R2},{R3,R5},{R4,R6},{R7,R8},
+{R9,R10}; testModel4 merges {R5,R6},{R7,R8},{R9,R10} with two reactions flipped.
+"""
+import pytest
+from tinit_oracles import (
+ TEST_MODEL4_GROUP_IDS,
+ TEST_MODEL4_MERGED_REV,
+ TEST_MODEL4_REVERSED_RXNS,
+ TEST_MODEL_GROUP_IDS,
+ TEST_MODEL_GROUPED_SCORES,
+ TEST_MODEL_MERGED_LB,
+ TEST_MODEL_MERGED_REV,
+ TEST_MODEL_SCORES,
+ make_test_model,
+ make_test_model4,
+)
+
+from raven_python.init import group_rxn_scores, merge_linear
+
+
+def test_test_model_group_ids():
+ _, orig_ids, group_ids, _ = merge_linear(make_test_model())
+ assert orig_ids == [f"R{i}" for i in range(1, 11)]
+ assert group_ids == TEST_MODEL_GROUP_IDS # [1,1,2,3,2,3,4,4,5,5]
+
+
+def test_test_model_reduced_shape():
+ reduced, _, _, _ = merge_linear(make_test_model())
+ # Five merged reactions, survivors keep the producer's id, original order.
+ assert [r.id for r in reduced.reactions] == ["R1", "R3", "R4", "R7", "R9"]
+ assert [int(r.lower_bound < 0) for r in reduced.reactions] == TEST_MODEL_MERGED_REV
+ assert [r.lower_bound for r in reduced.reactions] == TEST_MODEL_MERGED_LB
+
+
+def test_test_model_grouped_scores():
+ reduced, orig_ids, group_ids, _ = merge_linear(make_test_model())
+ scores = dict(zip(orig_ids, TEST_MODEL_SCORES, strict=True))
+ grouped = group_rxn_scores(reduced, scores, orig_ids, group_ids,
+ to_zero={"R1", "R2", "R8"})
+ got = [grouped[r.id] for r in reduced.reactions]
+ assert got == pytest.approx(TEST_MODEL_GROUPED_SCORES) # [0,-0.5,7.5,-1,0.5]
+
+
+def test_test_model4_group_ids_and_flips():
+ reduced, orig_ids, group_ids, reversed_rxns = merge_linear(make_test_model4())
+ assert group_ids == TEST_MODEL4_GROUP_IDS # [0,0,0,0,1,1,2,2,3,3,0]
+ assert [int(r.lower_bound < 0) for r in reduced.reactions] == TEST_MODEL4_MERGED_REV
+ flipped = {oid for oid, rev in zip(orig_ids, reversed_rxns, strict=True) if rev}
+ assert flipped == set(TEST_MODEL4_REVERSED_RXNS) # {R6, R9}
+
+
+def test_merge_preserves_feasible_space():
+ """The reduced model admits flux through the merged export path, like the original.
+
+ The reduced model carries no objective (merging drops genes and objective; ftINIT
+ sets its own from scores), so we set one on the surviving export reaction. R8
+ (e[s]=>) was merged into R7 (grp4), so R7 is the reduced export.
+ """
+ original = make_test_model()
+ assert original.slim_optimize() > 1e-9 # exports e via R8
+ reduced, _, _, _ = merge_linear(original)
+ reduced.objective = "R7"
+ assert reduced.slim_optimize() > 1e-9
+
+
+def test_no_merge_blocks_merging():
+ """A reaction in no_merge keeps its own group (id 0) and is not contracted."""
+ _, orig_ids, group_ids, _ = merge_linear(make_test_model(), no_merge=["R2"])
+ g = dict(zip(orig_ids, group_ids, strict=True))
+ assert g["R2"] == 0 # R2 never merged
+ # R1 was only mergeable with R2, so it stays unmerged too.
+ assert g["R1"] == 0
+
+
+def test_multipass_chain_collapses_to_one_group():
+ """A 3-reaction chain A→X→Y→Z collapses to one reaction (exercises multi-pass).
+
+ X is degree-2 (r1,r2), Y degree-2 (r2,r3); A and Z are degree-1 (retained). Merging
+ X makes Y newly degree-2 with the survivor, caught on a later pass. Confluence: all
+ three reactions end in one group, leaving the net A→Z reaction.
+ """
+ import cobra
+
+ m = cobra.Model("chain")
+ A, X, Y, Z = (cobra.Metabolite(i, name=i, compartment="c") for i in "AXYZ")
+ m.add_metabolites([A, X, Y, Z])
+ for rid, stoich in [("r1", {A: -1, X: 1}), ("r2", {X: -1, Y: 1}), ("r3", {Y: -1, Z: 1})]:
+ r = cobra.Reaction(rid, lower_bound=0, upper_bound=1000)
+ r.add_metabolites(stoich)
+ m.add_reactions([r])
+
+ reduced, orig_ids, group_ids, _ = merge_linear(m)
+ assert len(reduced.reactions) == 1 # collapsed to net A -> Z
+ assert len(set(group_ids)) == 1 and group_ids[0] != 0 # all three in one group
+ only = reduced.reactions[0]
+ assert {mt.id: c for mt, c in only.metabolites.items()} == {"A": -1.0, "Z": 1.0}
+
+
+def test_group_scores_zero_handling():
+ """Genuine-zero score → 0.01; a group cancelling to zero with nonzero members → 0.01."""
+ reduced, orig_ids, group_ids, _ = merge_linear(make_test_model())
+ # Give group {R3,R5} scores that cancel: R3=+1, R5=-1 -> sum 0 but members nonzero.
+ scores = dict.fromkeys(orig_ids, 0.0)
+ scores["R3"], scores["R5"] = 1.0, -1.0
+ grouped = group_rxn_scores(reduced, scores, orig_ids, group_ids)
+ assert grouped["R3"] == pytest.approx(0.01) # cancelled group rescued
+ assert grouped["R4"] == pytest.approx(0.02) # {R4,R6} both genuine-0 → 0.01+0.01
diff --git a/tests/test_init_oracles.py b/tests/test_init_oracles.py
new file mode 100644
index 0000000..3f3e52d
--- /dev/null
+++ b/tests/test_init_oracles.py
@@ -0,0 +1,64 @@
+"""Validate the ftINIT toy oracles and that our scoring reproduces RAVEN's.
+
+This is Phase 4d.0: the correctness scaffold. The (ft)INIT MILP itself is not yet
+ported, so the on/off-output oracles in tinit_oracles live there as constants for the
+later sub-phases; here we lock down the pieces that already exist — the score→
+expression inversion and scoreComplexModel-equivalent scoring (RAVEN tinitTests
+T0009).
+"""
+import pytest
+from tinit_oracles import (
+ TEST_MODEL4_SCORES,
+ TEST_MODEL5_SCORES,
+ TEST_MODEL_SCORES,
+ expr_for_rxn_score,
+ make_test_model,
+ make_test_model4,
+ make_test_model5,
+)
+
+from raven_python.init.score import gene_scores_from_expression, score_reactions_from_genes
+
+
+@pytest.mark.parametrize(
+ "make_model, scores",
+ [
+ (make_test_model, TEST_MODEL_SCORES),
+ (make_test_model4, TEST_MODEL4_SCORES),
+ (make_test_model5, TEST_MODEL5_SCORES),
+ ],
+)
+def test_scoring_reproduces_defined_scores(make_model, scores):
+ """RAVEN T0009: expr_for_rxn_score → scoreComplexModel round-trips the scores."""
+ model = make_model()
+ expression = expr_for_rxn_score(scores)
+ gene_scores = gene_scores_from_expression(expression, 1.0)
+ rxn_scores = score_reactions_from_genes(model, gene_scores)
+ got = [rxn_scores[r.id] for r in model.reactions]
+ assert got == pytest.approx(scores, abs=1e-10)
+
+
+def test_expr_for_rxn_score_inverts_scoring():
+ """level = exp(score/5); 5·ln(level/1) recovers the score."""
+ scores = [-5, -1, 0.5, 7, 10]
+ expr = expr_for_rxn_score(scores)
+ recovered = gene_scores_from_expression(expr, 1.0)
+ assert [recovered[f"G{i + 1}"] for i in range(len(scores))] == pytest.approx(scores)
+
+
+def test_test_model_structure():
+ """Sanity: shapes, no-GPR reactions, reversibility, objective."""
+ m = make_test_model()
+ assert len(m.reactions) == 10 and len(m.metabolites) == 8
+ no_gpr = {r.id for r in m.reactions if not r.genes}
+ assert no_gpr == {"R1", "R2", "R8"} # the reactions scored -2 (no gene)
+ rev = {r.id for r in m.reactions if r.lower_bound < 0}
+ assert rev == {"R2", "R3", "R4", "R9", "R10"}
+ assert m.objective.expression.as_coefficients_dict() # objective set (R8)
+
+
+def test_test_model_is_feasible_for_the_task():
+ """The toy model can actually make e[s] from a[s] (so the task oracle is meaningful)."""
+ m = make_test_model()
+ m.objective = "R8"
+ assert m.slim_optimize() > 1e-6
diff --git a/tests/test_init_pipeline.py b/tests/test_init_pipeline.py
new file mode 100644
index 0000000..bf2a2ac
--- /dev/null
+++ b/tests/test_init_pipeline.py
@@ -0,0 +1,161 @@
+"""Phase 4d.3b: the staged ftINIT pipeline (prep_init_model + get_init_steps + ftinit).
+
+Oracles: RAVEN tinitTests T0001/T0002 on testModel with the default '1+1' schedule.
+"""
+
+from tinit_oracles import (
+ TEST_MODEL_FTINIT_NO_TASKS,
+ TEST_MODEL_FTINIT_SPONT_R7_R10,
+ TEST_MODEL_FTINIT_WITH_TASK,
+ TEST_MODEL_SCORES,
+ TEST_MODEL_TASK_ESSENTIAL_MERGED,
+ expr_for_rxn_score,
+ make_test_model,
+ make_test_task,
+)
+
+from raven_python.init import (
+ classify_reactions,
+ ftinit,
+ get_init_steps,
+ prep_init_model,
+ score_reactions_from_genes,
+)
+from raven_python.init.score import gene_scores_from_expression
+
+
+def _scores(model):
+ return score_reactions_from_genes(
+ model, gene_scores_from_expression(expr_for_rxn_score(TEST_MODEL_SCORES), 1.0)
+ )
+
+
+# --------------------------------------------------------------------------- #
+# classify_reactions (the toIgnore masks) — tinitTests T0001 mask oracle.
+# --------------------------------------------------------------------------- #
+def test_classify_exchange_and_transport():
+ masks = classify_reactions(make_test_model(), ext_comp="s")
+ assert masks.exchange == {"R1", "R8"} # boundary reactions
+ assert masks.import_rxns == {"R2"} # a[s] <=> a[c], no GPR, into ext comp
+ assert masks.no_gpr == {"R1", "R2", "R8"}
+ assert "R7" not in masks.import_rxns # R7 has a GPR -> not a transport category
+
+
+def test_classify_spontaneous():
+ masks = classify_reactions(make_test_model(), ext_comp="s", spontaneous=["R7", "R10"])
+ assert masks.exchange | masks.spontaneous == {"R1", "R7", "R8", "R10"}
+
+
+def test_get_init_steps_default():
+ steps = get_init_steps("1+1")
+ assert len(steps) == 2
+ assert steps[0].how_to_use_prev == "ignore"
+ assert steps[0].ignore_mask == (1, 1, 1, 1, 1, 1, 1, 0)
+ assert steps[1].how_to_use_prev == "essential"
+ assert steps[1].ignore_mask == (1, 0, 0, 0, 1, 0, 0, 0)
+ assert len(get_init_steps("full")) == 1
+
+
+# --------------------------------------------------------------------------- #
+# Full '1+1' pipeline — T0001 (no tasks) and T0002 (with task).
+# --------------------------------------------------------------------------- #
+def test_ftinit_no_tasks_matches_oracle():
+ """T0001: testModel, no tasks, '1+1' → {R1,R4,R6,R8,R9,R10}."""
+ model = make_test_model()
+ prep = prep_init_model(model, ext_comp="s")
+ out = ftinit(prep, _scores(model))
+ assert {r.id for r in out.reactions} == set(TEST_MODEL_FTINIT_NO_TASKS)
+
+
+def test_ftinit_with_spontaneous_matches_oracle():
+ """T0001 variant: R7,R10 spontaneous → the path through R2/R7, {R1,R2,R4,R6,R7,R8}."""
+ model = make_test_model()
+ prep = prep_init_model(model, ext_comp="s", spontaneous=["R7", "R10"])
+ out = ftinit(prep, _scores(model))
+ assert {r.id for r in out.reactions} == set(TEST_MODEL_FTINIT_SPONT_R7_R10)
+
+
+def test_ftinit_with_task_matches_oracle():
+ """T0002: task 'make e[s] from a[s]' makes R2,R7 essential → {R1,R2,R4,R6,R7,R8,R9,R10}."""
+ model = make_test_model()
+ prep = prep_init_model(model, [make_test_task()], ext_comp="s")
+ # Essentials map to merged ids {R1, R7} (RAVEN T0002).
+ assert prep.essential_rxns == set(TEST_MODEL_TASK_ESSENTIAL_MERGED)
+ out = ftinit(prep, _scores(model))
+ assert {r.id for r in out.reactions} == set(TEST_MODEL_FTINIT_WITH_TASK)
+
+
+def test_full_series_runs():
+ """The single-step 'full' series also produces a feasible subnetwork."""
+ model = make_test_model()
+ prep = prep_init_model(model, ext_comp="s")
+ out = ftinit(prep, _scores(model), series="full")
+ assert len(out.reactions) >= 1
+
+
+def test_pipeline_with_gene_scores_and_tasks_wires_up():
+ """ftinit accepts gene_scores (gene pruning) + tasks (gap-fill) without breaking T0002.
+
+ The toy's GPRs are single-gene (nothing to prune) and the task is feasible in the
+ extracted model (nothing to gap-fill), so the reaction set is unchanged — this
+ confirms the integration wiring (the pruning/gap-fill logic is unit-tested
+ separately in test_init_genes / test_init_taskfill).
+ """
+ model = make_test_model()
+ gene_scores = gene_scores_from_expression(expr_for_rxn_score(TEST_MODEL_SCORES), 1.0)
+ prep = prep_init_model(model, [make_test_task()], ext_comp="s")
+ out = ftinit(prep, _scores(model), gene_scores=gene_scores)
+ assert {r.id for r in out.reactions} == set(TEST_MODEL_FTINIT_WITH_TASK)
+
+
+def test_orient_forward_reverses_a_reversible_reaction():
+ """_orient_forward(rxn, -1) flips stoichiometry and makes it irreversible forward."""
+ import cobra
+
+ from raven_python.init.prep import _orient_forward
+
+ m = cobra.Model("o")
+ a, b = (cobra.Metabolite(x, compartment="s") for x in "ab")
+ m.add_metabolites([a, b])
+ r = cobra.Reaction("R", lower_bound=-800, upper_bound=1000)
+ r.add_metabolites({a: -1, b: 2}) # a <=> 2 b
+ m.add_reactions([r])
+
+ _orient_forward(r, -1) # forced reverse → becomes forward
+ assert r.bounds == (0, 800) # [-800,1000] → flip [-1000,800] → lb→0
+ assert {mt.id: c for mt, c in r.metabolites.items()} == {"a": 1, "b": -2} # 2 b => a
+
+ fwd = cobra.Reaction("F", lower_bound=-500, upper_bound=900)
+ fwd.add_metabolites({a: -1})
+ m.add_reactions([fwd])
+ _orient_forward(fwd, 1) # forced forward → just made irreversible
+ assert fwd.bounds == (0, 900)
+
+
+def test_essential_merged_away_is_skipped():
+ """An essential reaction whose merge group collapses away imposes no constraint.
+
+ REV sits between two exchanges, so it merges with them into a trivial source→sink
+ that is removed; its group has no survivor. prep_init_model must skip it, not crash.
+ """
+ import cobra
+
+ from raven_python.tasks import Task
+
+ m = cobra.Model("collapse")
+ a, b = (cobra.Metabolite(x, name=x, compartment="s") for x in "ab")
+ m.add_metabolites([a, b])
+ r = cobra.Reaction("REV", lower_bound=-1000, upper_bound=1000)
+ r.add_metabolites({a: -1, b: 1})
+ r.gene_reaction_rule = "g1"
+ exchanges = []
+ for met in (a, b):
+ ex = cobra.Reaction(f"EX_{met.id}", lower_bound=-1000, upper_bound=1000)
+ ex.add_metabolites({met: -1})
+ exchanges.append(ex)
+ m.add_reactions([r, *exchanges])
+ m.objective = "REV"
+ task = Task(id="mk_a", inputs=[("b[s]", 0.0, 1000.0)], outputs=[("a[s]", 1.0, 1.0)])
+
+ prep = prep_init_model(m, [task], ext_comp="s") # must not raise
+ assert "REV" not in prep.essential_rxns # merged into a collapsed group
diff --git a/tests/test_init_solvers.py b/tests/test_init_solvers.py
new file mode 100644
index 0000000..514c408
--- /dev/null
+++ b/tests/test_init_solvers.py
@@ -0,0 +1,149 @@
+"""Cross-solver smoke tests for the (f)tINIT MILP path.
+
+The clean-data calibration and robustness studies were run on Gurobi; the tractability
+choices (big-M=100, MIP gap, time limits) and the Gurobi-specific param plumbing
+(``opt.problem.Params.MIPGap``) only matter if those choices also work on the *other*
+MILP backends real users have. These tests assert that each available MILP-capable
+optlang interface produces the same reaction-set verdict as Gurobi on the toy models the
+unit tests use — so a regression in solver portability fails CI instead of being found
+months later on a user's machine.
+
+Solvers tested: every MILP-capable cobra/optlang interface that imports in this env
+(Gurobi, HiGHS via ``hybrid``, GLPK). Missing ones are skipped automatically. Genome-scale
+behaviour is measured separately by ``scripts/analyze_init_solvers.py`` (manual benchmark).
+"""
+from __future__ import annotations
+
+import importlib
+
+import cobra
+import pytest
+
+from raven_python.init import ftinit, prep_init_model, run_ftinit, run_init
+from raven_python.tasks import Task, check_tasks
+
+# Detect which MILP-capable optlang interfaces actually work; skip the rest.
+# We do a real import (not just find_spec) because optlang ships every backend's
+# module file but those that wrap third-party solvers (gurobi, cplex) only import
+# cleanly when the underlying solver is installed — find_spec would say "present"
+# and then we'd crash at fixture time on CI runners without Gurobi.
+_INTERFACES = {"gurobi": "gurobi_interface", "hybrid": "hybrid_interface", "glpk": "glpk_interface"}
+
+
+def _solver_available(modname: str) -> bool:
+ try:
+ importlib.import_module(f"optlang.{modname}")
+ return True
+ except ImportError:
+ return False
+
+
+_AVAILABLE = [name for name, mod in _INTERFACES.items() if _solver_available(mod)]
+
+# Known upstream blocker: ``optlang.hybrid_interface.Configuration.clone()`` rejects
+# ``lp_method='primal'``. Marked strict so this flips red when optlang is fixed and
+# we should drop the marker. See docs/init_solver_benchmark.md.
+_XFAIL = {"hybrid": pytest.mark.xfail(
+ reason="optlang hybrid_interface.Configuration rejects lp_method='primal' (upstream)",
+ strict=True, raises=ValueError,
+)}
+
+
+def _param(name: str):
+ marks = [_XFAIL[name]] if name in _XFAIL else []
+ return pytest.param(name, marks=marks, id=name)
+
+
+@pytest.fixture(params=[_param(n) for n in _AVAILABLE])
+def solver(request):
+ """One installed MILP solver per parameter value."""
+ return request.param
+
+
+# ----------------------------------------------------------------------- toy fixtures
+
+def _met(mid, comp="c"):
+ return cobra.Metabolite(mid, name=mid.split("_")[0], compartment=comp)
+
+
+def _toy_init_model() -> cobra.Model:
+ """EX_A → A → B → C → D (r1, r2 good; r3 bad). Same network as test_init.py."""
+ def rxn(rid, lb, ub, mets):
+ r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub)
+ r.add_metabolites(mets)
+ return r
+ m = cobra.Model("toy")
+ A, B, C, D = (_met(x) for x in ("A_c", "B_c", "C_c", "D_c"))
+ m.add_metabolites([A, B, C, D])
+ m.add_reactions([rxn("EX_A", -1000, 1000, {A: -1}),
+ rxn("r1", 0, 1000, {A: -1, B: 1}),
+ rxn("r2", 0, 1000, {B: -1, C: 1}),
+ rxn("r3", 0, 1000, {C: -1, D: 1})])
+ return m
+
+
+def _toy_ftinit_model() -> cobra.Model:
+ """Small flux-consistent network for ftINIT: A→B, B→C, parallel A→C (negative-score)."""
+ def rxn(rid, lb, ub, mets):
+ r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub)
+ r.add_metabolites(mets)
+ return r
+ m = cobra.Model("ftoy")
+ A, B, C = (_met(x) for x in ("A_c", "B_c", "C_c"))
+ m.add_metabolites([A, B, C])
+ m.add_reactions([rxn("EX_A", -1000, 0, {A: -1}),
+ rxn("EX_C", 0, 1000, {C: -1}),
+ rxn("r1", 0, 1000, {A: -1, B: 1}),
+ rxn("r2", 0, 1000, {B: -1, C: 1}),
+ rxn("rbad", 0, 1000, {A: -1, C: 1})])
+ return m
+
+
+# --------------------------------------------------------------------- tests
+
+def test_run_init_same_verdict(solver):
+ """tINIT MILP on a small network drops the negative-score reaction with any solver."""
+ m = _toy_init_model()
+ m.solver = solver
+ res = run_init(m, {"r1": 1.0, "r2": 1.0, "r3": -1.0}, prod_weight=0.0, allow_excretion=True)
+ assert "r3" in res.deleted_reactions
+ assert sorted(set(r.id for r in res.model.reactions)) == ["EX_A", "r1", "r2"]
+
+
+def test_run_ftinit_same_verdict(solver):
+ """ftINIT MILP picks the same on-set across solvers on a small network."""
+ m = _toy_ftinit_model()
+ m.solver = solver
+ res = run_ftinit(m, {"r1": 1.0, "r2": 1.0, "rbad": -1.0}, allow_excretion=True)
+ assert "rbad" not in res.on_reactions
+ assert {"r1", "r2"}.issubset(res.on_reactions)
+
+
+def test_check_tasks_works_per_solver(solver):
+ """check_tasks (one slim_optimize per task) works with each solver."""
+ m = _toy_ftinit_model()
+ m.solver = solver
+ task = Task(id="make_c", inputs=[("A[c]", 0.0, 1000.0)], outputs=[("C[c]", 1.0, 1.0)])
+ results = check_tasks(m, [task])
+ assert results[0].passed
+
+
+def test_ftinit_pipeline_with_tasks(solver):
+ """The full ftinit() pipeline (prep + staged MILP + gap-fill) runs with each solver."""
+ m = _toy_ftinit_model()
+ m.solver = solver
+ task = Task(id="make_c", inputs=[("A[c]", 0.0, 1000.0)], outputs=[("C[c]", 1.0, 1.0)])
+ prep = prep_init_model(m, [task])
+ out = ftinit(prep, {"r1": 1.0, "r2": 1.0, "rbad": -1.0}, series="1+1")
+ # Functional: the target task remains satisfiable in the extracted model.
+ assert check_tasks(out, [task])[0].passed
+
+
+def test_solver_param_plumbing(solver):
+ """mip_gap / time_limit reach the solver without raising (graceful per backend)."""
+ m = _toy_ftinit_model()
+ m.solver = solver
+ # Tight time limit + loose gap on a trivial problem; just verify the call returns.
+ res = run_ftinit(m, {"r1": 1.0, "rbad": -1.0}, allow_excretion=True,
+ mip_gap=0.05, time_limit=60)
+ assert res.objective is not None
diff --git a/tests/test_init_taskfill.py b/tests/test_init_taskfill.py
new file mode 100644
index 0000000..c975f41
--- /dev/null
+++ b/tests/test_init_taskfill.py
@@ -0,0 +1,83 @@
+"""Phase 4d.4: task gap-filling (fill_tasks).
+
+Oracle: RAVEN tinitTests T0003. Remove the exchange reactions and create a gap by
+deleting R7 (e[c] -> e[s]); gap-filling against the full reference must add R7 back so
+the task 'make e[s] from a[s]' becomes feasible again.
+"""
+from tinit_oracles import make_test_model, make_test_task
+
+from raven_python.init import TaskFillResult, fill_tasks
+
+
+def _reference_without_exchanges():
+ """testModel with the exchange reactions (R1, R8) removed — the gap-fill template."""
+ ref = make_test_model()
+ ref.remove_reactions(["R1", "R8"], remove_orphans=False)
+ return ref
+
+
+def test_fills_the_gap_with_r7():
+ ref = _reference_without_exchanges()
+ gapped = ref.copy()
+ gapped.remove_reactions(["R7"], remove_orphans=False) # the gap
+ res = fill_tasks(gapped, ref, [make_test_task()])
+ assert isinstance(res, TaskFillResult)
+ assert res.added_reactions == ["R7"]
+ assert "R7" in {r.id for r in res.model.reactions}
+ assert not res.failed_tasks
+
+
+def test_no_fill_when_already_feasible():
+ """A model that can already do the task gets no additions."""
+ ref = _reference_without_exchanges()
+ res = fill_tasks(ref.copy(), ref, [make_test_task()])
+ assert res.added_reactions == []
+
+
+def test_should_fail_tasks_ignored():
+ from raven_python.tasks import Task
+
+ ref = _reference_without_exchanges()
+ gapped = ref.copy()
+ gapped.remove_reactions(["R7"], remove_orphans=False)
+ sf = Task(id="sf", should_fail=True, outputs=[("e[s]", 1.0, 1.0)])
+ res = fill_tasks(gapped, ref, [sf])
+ assert res.added_reactions == [] # should_fail task drives no gap-filling
+
+
+def test_open_exchange_does_not_short_circuit_gapfill():
+ """Boundaries are closed during gap-filling, so an open exchange can't fake feasibility.
+
+ Give the gapped model an open exchange on e[s]; without closing boundaries the task
+ 'produce e[s]' would look feasible (free secretion) and R7 would never be added.
+ """
+ import cobra
+
+ ref = _reference_without_exchanges()
+ gapped = ref.copy()
+ gapped.remove_reactions(["R7"], remove_orphans=False)
+ ex_es = cobra.Reaction("EX_es", lower_bound=-1000, upper_bound=1000)
+ ex_es.add_metabolites({gapped.metabolites.es: -1})
+ gapped.add_reactions([ex_es]) # open exchange that must be ignored
+ res = fill_tasks(gapped, ref, [make_test_task()])
+ assert "R7" in res.added_reactions # gap still detected and filled
+
+
+def test_prefers_cheaper_reactions_by_score():
+ """When two candidates can fill a gap, the higher-scored (cheaper) one is chosen.
+
+ Build a gap that R7 (e[c]->e[s]) OR an alternative ALT (e[c]->e[s]) can fill; give
+ ALT a much better score so it is preferred.
+ """
+ import cobra
+
+ ref = _reference_without_exchanges()
+ alt = cobra.Reaction("ALT", lower_bound=0, upper_bound=1000)
+ alt.add_metabolites({ref.metabolites.ec: -1, ref.metabolites.es: 1}) # same as R7
+ alt.gene_reaction_rule = "gALT"
+ ref.add_reactions([alt])
+ gapped = ref.copy()
+ gapped.remove_reactions(["R7", "ALT"], remove_orphans=False)
+ # ALT scored high (cost low), R7 scored low (cost high) → ALT chosen.
+ res = fill_tasks(gapped, ref, [make_test_task()], rxn_scores={"ALT": 5.0, "R7": -3.0})
+ assert res.added_reactions == ["ALT"]
diff --git a/tests/test_io_excel.py b/tests/test_io_excel.py
new file mode 100644
index 0000000..12434ef
--- /dev/null
+++ b/tests/test_io_excel.py
@@ -0,0 +1,111 @@
+"""Tests for raven_python.io.excel (exportToExcelFormat port, export only)."""
+import cobra
+import pytest
+
+openpyxl = pytest.importorskip("openpyxl")
+
+from raven_python.io import export_to_excel
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("yeastGEM")
+ m.name = "Yeast"
+ m.compartments = {"c": "cytoplasm"}
+ m.notes["metaData"] = {"taxonomy": "taxonomy/559292", "defaultLB": "-1000"}
+ m.add_metabolites(
+ [
+ cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+ ]
+ )
+ m.metabolites.atp_c.annotation = {"kegg.compound": ["C00002"], "smiles": ["C1=NC"]}
+ m.metabolites.atp_c.notes = {"inchis": "InChI=1S/X"}
+ add_reactions_from_equations(
+ m,
+ [{"id": "R1", "equation": "atp_c <=> adp_c", "name": "rxn one",
+ "gene_reaction_rule": "G1", "subsystem": "glycolysis"}],
+ )
+ r = m.reactions.R1
+ r.annotation = {"ec-code": ["1.1.1.1"], "kegg.reaction": ["R00001"]}
+ r.notes = {"confidence_score": 2, "note": "a note", "references": "PMID:1"}
+ r.objective_coefficient = 1
+ return m
+
+
+def _wb(path):
+ return openpyxl.load_workbook(path)
+
+
+def test_sheets_present(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ wb = _wb(out)
+ assert set(wb.sheetnames) == {"RXNS", "METS", "COMPS", "GENES", "MODEL"}
+
+
+def test_rxns_sheet(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ ws = _wb(out)["RXNS"]
+ header = [c.value for c in ws[1]]
+ row = {header[i]: c.value for i, c in enumerate(ws[2])}
+ assert row["ID"] == "R1"
+ assert row["NAME"] == "rxn one"
+ assert "ATP[c]" in row["EQUATION"] and "<=>" in row["EQUATION"]
+ assert row["EC-NUMBER"] == "1.1.1.1"
+ assert row["GENE ASSOCIATION"] == "G1"
+ assert row["SUBSYSTEM"] == "glycolysis"
+ assert row["OBJECTIVE"] == 1
+ assert row["CONFIDENCE SCORE"] == 2
+ assert row["NOTE"] == "a note"
+ assert row["MIRIAM"] == "kegg.reaction/R00001" # ec-code excluded (own column)
+
+
+def test_mets_sheet(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ ws = _wb(out)["METS"]
+ header = [c.value for c in ws[1]]
+ rows = {
+ r[header.index("REPLACEMENT ID")].value: {header[i]: c.value for i, c in enumerate(r)}
+ for r in ws.iter_rows(min_row=2)
+ }
+ atp = rows["atp_c"]
+ assert atp["ID"] == "ATP[c]"
+ assert atp["NAME"] == "ATP"
+ assert atp["InChI"] == "InChI=1S/X"
+ assert atp["COMPOSITION"] is None # suppressed when InChI present
+ assert atp["CHARGE"] == -4
+ assert atp["MIRIAM"] == "kegg.compound/C00002" # smiles excluded
+
+
+def test_model_sheet(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ ws = _wb(out)["MODEL"]
+ header = [c.value for c in ws[1]]
+ row = {header[i]: c.value for i, c in enumerate(ws[2])}
+ assert row["ID"] == "yeastGEM"
+ assert row["NAME"] == "Yeast"
+ assert row["TAXONOMY"] == "taxonomy/559292"
+ assert row["DEFAULT LOWER"] == "-1000"
+
+
+def test_genes_sheet(model, tmp_path):
+ out = tmp_path / "m.xlsx"
+ export_to_excel(model, out)
+ ws = _wb(out)["GENES"]
+ header = [c.value for c in ws[1]]
+ row = {header[i]: c.value for i, c in enumerate(ws[2])}
+ assert row["NAME"] == "G1"
+
+
+def test_no_genes_skips_sheet(tmp_path):
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite("a_c", compartment="c")])
+ add_reactions_from_equations(m, [{"id": "R1", "equation": "a_c -->"}])
+ out = tmp_path / "m.xlsx"
+ export_to_excel(m, out)
+ assert "GENES" not in _wb(out).sheetnames
diff --git a/tests/test_io_git.py b/tests/test_io_git.py
new file mode 100644
index 0000000..28881dc
--- /dev/null
+++ b/tests/test_io_git.py
@@ -0,0 +1,69 @@
+"""Tests for raven_python.io.git (exportForGit port)."""
+import cobra
+import pytest
+
+from raven_python.io import export_for_git
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("yeastGEM")
+ m.compartments = {"c": "cytoplasm"}
+ m.add_metabolites(
+ [cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP", compartment="c")]
+ )
+ add_reactions_from_equations(m, [{"id": "R1", "equation": "atp_c <=> adp_c"}])
+ return m
+
+
+def test_standard_gem_layout(model, tmp_path):
+ root = export_for_git(model, tmp_path, prefix="yeast", formats=("yml", "xml", "mat", "xlsx", "txt"))
+ assert root == tmp_path / "model"
+ assert (root / "yml" / "yeast.yml").exists()
+ assert (root / "xml" / "yeast.xml").exists()
+ assert (root / "mat" / "yeast.mat").exists()
+ assert (root / "xlsx" / "yeast.xlsx").exists()
+ assert (root / "txt" / "yeast.txt").exists()
+ assert (root / "dependencies.txt").exists()
+
+
+def test_dependencies_file(model, tmp_path):
+ root = export_for_git(model, tmp_path, formats=("yml",))
+ deps = (root / "dependencies.txt").read_text()
+ assert "python\t" in deps
+ assert "cobra\t" in deps
+ assert "raven_python\t" in deps
+
+
+def test_flat_layout(model, tmp_path):
+ root = export_for_git(model, tmp_path, formats=("yml",), sub_dirs=False)
+ assert root == tmp_path
+ assert (tmp_path / "model.yml").exists()
+
+
+def test_subset_of_formats(model, tmp_path):
+ root = export_for_git(model, tmp_path, formats=("yml", "xml"))
+ assert (root / "yml" / "model.yml").exists()
+ assert not (root / "mat").exists()
+ assert not (root / "xlsx").exists()
+
+
+def test_does_not_mutate_model(model, tmp_path):
+ order_before = [r.id for r in model.reactions]
+ export_for_git(model, tmp_path, formats=("yml",))
+ assert [r.id for r in model.reactions] == order_before
+
+
+def test_txt_table_content(model, tmp_path):
+ root = export_for_git(model, tmp_path, formats=("txt",))
+ txt = (root / "txt" / "model.txt").read_text()
+ assert txt.splitlines()[0].startswith("Rxn name\t")
+ assert "R1" in txt
+ assert "ATP[c]" in txt
+
+
+def test_bad_format(model, tmp_path):
+ with pytest.raises(ValueError, match="Unknown format"):
+ export_for_git(model, tmp_path, formats=("yml", "json"))
diff --git a/tests/test_io_sif.py b/tests/test_io_sif.py
new file mode 100644
index 0000000..d50ad98
--- /dev/null
+++ b/tests/test_io_sif.py
@@ -0,0 +1,82 @@
+"""Tests for raven_python.io.sif (exportModelToSIF port)."""
+import cobra
+import pytest
+
+from raven_python.io import export_model_to_sif
+from raven_python.manipulation import add_reactions_from_equations
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b"},
+ {"id": "R2", "equation": "b --> c"},
+ ],
+ )
+ return m
+
+
+def _lines(path):
+ return [ln.split("\t") for ln in path.read_text().splitlines()]
+
+
+def test_reaction_compound(model, tmp_path):
+ out = tmp_path / "g.sif"
+ export_model_to_sif(model, out, "rc")
+ rows = {r[0]: (r[1], set(r[2:])) for r in _lines(out)}
+ assert rows["R1"] == ("rc", {"a", "b"})
+ assert rows["R2"] == ("rc", {"b", "c"})
+
+
+def test_reaction_reaction(model, tmp_path):
+ out = tmp_path / "g.sif"
+ export_model_to_sif(model, out, "rr")
+ rows = {r[0]: set(r[2:]) for r in _lines(out)}
+ # R1 and R2 share metabolite b
+ assert rows["R1"] == {"R2"}
+ assert rows["R2"] == {"R1"}
+
+
+def test_compound_compound(model, tmp_path):
+ out = tmp_path / "g.sif"
+ export_model_to_sif(model, out, "cc")
+ rows = {r[0]: set(r[2:]) for r in _lines(out)}
+ # a is a substrate of R1 (a->b): a links to product b
+ assert "b" in rows.get("a", set())
+ # b is substrate of R2 (b->c): b links to c
+ assert "c" in rows.get("b", set())
+
+
+def test_custom_labels(model, tmp_path):
+ out = tmp_path / "g.sif"
+ export_model_to_sif(model, out, "rc", reaction_labels={"R1": "Reaction1"})
+ sources = {r[0] for r in _lines(out)}
+ assert "Reaction1" in sources
+ assert "R1" not in sources
+
+
+def test_bad_graph_type(model, tmp_path):
+ with pytest.raises(ValueError, match="graph_type"):
+ export_model_to_sif(model, tmp_path / "g.sif", "xx")
+
+
+def test_cc_does_not_mutate_input(model, tmp_path):
+ n_before = len(model.reactions)
+ export_model_to_sif(model, tmp_path / "g.sif", "cc")
+ assert len(model.reactions) == n_before # convert_to_irreversible ran on a copy
+
+
+# --- regression: label-map collision (known_issues.md B4) ------------------
+
+def test_collapsing_label_map_warns(model, tmp_path):
+ """A label map that sends two distinct ids to the same label silently merges
+ nodes during the target-side dedup. Now warns so the user sees it."""
+ with pytest.warns(UserWarning, match="multiple ids to the same label"):
+ export_model_to_sif(
+ model, tmp_path / "g.sif", "rc",
+ reaction_labels={"R1": "shared", "R2": "shared"},
+ )
diff --git a/tests/test_io_yaml.py b/tests/test_io_yaml.py
new file mode 100644
index 0000000..510af5f
--- /dev/null
+++ b/tests/test_io_yaml.py
@@ -0,0 +1,202 @@
+"""Tests for raven_python.io.yaml against the RAVEN fa281a1 (cobra-native !!omap) schema."""
+from pathlib import Path
+
+import cobra
+import pytest
+from cobra.io.yaml import yaml as cobra_yaml
+
+from raven_python.io import read_yaml_model, write_yaml_model
+
+# A model laid out exactly as RAVEN writeYAMLmodel (fa281a1) emits: cobra-native
+# structure, RAVEN-only fields as top-level per-entry keys, smiles/ec-code inside
+# the annotation block, metaData provenance-only, id/name/version top-level.
+RAVEN_DOC = {
+ "metabolites": [
+ {
+ "id": "s_0001",
+ "name": "ATP",
+ "compartment": "c",
+ "formula": "C10H16N5O13P3",
+ "charge": -4,
+ "inchis": "InChI=1S/CH4",
+ "deltaG": 12.5,
+ "notes": "a metabolite note",
+ "metFrom": "KEGG",
+ "annotation": {"kegg.compound": ["C00002"], "smiles": ["C1=NC2"]},
+ },
+ {"id": "s_0002", "name": "ADP", "compartment": "c"},
+ ],
+ "reactions": [
+ {
+ "id": "R1",
+ "name": "rxn one",
+ "metabolites": {"s_0001": -1, "s_0002": 1},
+ "lower_bound": -1000.0,
+ "upper_bound": 1000.0,
+ "gene_reaction_rule": "G1",
+ "subsystem": "glycolysis",
+ "confidence_score": 2,
+ "references": "PMID:123",
+ "rxnFrom": "manual",
+ "notes": "a reaction note",
+ "deltaG": -5.0,
+ "annotation": {"ec-code": ["1.1.1.1"]},
+ }
+ ],
+ "genes": [
+ {"id": "G1", "name": "gene one", "protein": "P12345", "annotation": {"uniprot": ["P12345"]}}
+ ],
+ "id": "testModel",
+ "name": "Test Model",
+ "compartments": {"c": "cytoplasm"},
+ "version": "1.0",
+ "metaData": {"date": "2026-05-23", "taxonomy": "taxonomy/559292", "defaultLB": "-1000"},
+ "ec-rxns": [{"id": "R1", "kcat": 100.0}],
+}
+
+
+@pytest.fixture
+def yaml_file(tmp_path) -> Path:
+ p = tmp_path / "model.yml"
+ with open(p, "w", encoding="utf-8") as fh:
+ cobra_yaml.dump(RAVEN_DOC, fh)
+ return p
+
+
+def test_standard_content(yaml_file):
+ model = read_yaml_model(yaml_file)
+ assert model.id == "testModel"
+ assert model.name == "Test Model"
+ assert {m.id for m in model.metabolites} == {"s_0001", "s_0002"}
+ r = model.reactions.get_by_id("R1")
+ assert r.bounds == (-1000.0, 1000.0)
+ assert r.subsystem == "glycolysis"
+ assert r.gene_reaction_rule == "G1"
+
+
+def test_annotation_owned_by_cobra(yaml_file):
+ # smiles / ec-code / miriam live in the annotation block (cobra reads them)
+ model = read_yaml_model(yaml_file)
+ assert model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"]
+ assert model.metabolites.get_by_id("s_0001").annotation["kegg.compound"] == ["C00002"]
+ assert model.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"]
+ assert model.genes.get_by_id("G1").annotation["uniprot"] == ["P12345"]
+
+
+def test_raven_only_fields_captured(yaml_file):
+ model = read_yaml_model(yaml_file)
+ a = model.metabolites.get_by_id("s_0001")
+ assert a.notes["inchis"] == "InChI=1S/CH4"
+ assert a.notes["deltaG"] == 12.5
+ assert a.notes["note"] == "a metabolite note" # RAVEN metNotes string, no crash
+ assert a.notes["metFrom"] == "KEGG"
+ assert "smiles" not in a.notes # smiles stays in annotation
+ r = model.reactions.get_by_id("R1")
+ assert r.notes["confidence_score"] == 2
+ assert r.notes["references"] == "PMID:123"
+ assert r.notes["rxnFrom"] == "manual"
+ assert r.notes["note"] == "a reaction note"
+ assert r.notes["deltaG"] == -5.0
+ assert model.genes.get_by_id("G1").notes["protein"] == "P12345"
+
+
+def test_model_level_extras(yaml_file):
+ model = read_yaml_model(yaml_file)
+ assert model.notes["metaData"]["taxonomy"] == "taxonomy/559292"
+ assert model.notes["version"] == "1.0"
+ assert model.notes["_yaml_sections"]["ec-rxns"][0]["kcat"] == 100.0
+
+
+def test_round_trip(yaml_file, tmp_path):
+ model = read_yaml_model(yaml_file)
+ out = tmp_path / "out.yml"
+ write_yaml_model(model, out)
+ reloaded = read_yaml_model(out)
+
+ assert reloaded.id == "testModel"
+ assert reloaded.notes["version"] == "1.0"
+ assert reloaded.notes["metaData"]["taxonomy"] == "taxonomy/559292"
+ a = reloaded.metabolites.get_by_id("s_0001")
+ assert a.notes["deltaG"] == 12.5
+ assert a.notes["note"] == "a metabolite note"
+ assert a.annotation["smiles"] == ["C1=NC2"]
+ r = reloaded.reactions.get_by_id("R1")
+ assert r.notes["confidence_score"] == 2
+ assert reloaded.genes.get_by_id("G1").notes["protein"] == "P12345"
+ assert reloaded.notes["_yaml_sections"]["ec-rxns"][0]["id"] == "R1"
+
+
+def test_extra_notes_not_dropped_when_free_text_note_present(yaml_file, tmp_path):
+ """An entry with both a RAVEN free-text note and an extra note keeps both on write."""
+ model = read_yaml_model(yaml_file)
+ a = model.metabolites.get_by_id("s_0001")
+ a.notes["note"] = "free text"
+ a.notes["custom"] = "extra value" # a non-RAVEN note that must not be silently lost
+ out = tmp_path / "out.yml"
+ write_yaml_model(model, out)
+ text = out.read_text()
+ assert "extra value" in text # the leftover note survives serialization
+
+
+def test_gzipped_round_trip(yaml_file, tmp_path):
+ # A .yml.gz path is transparently gzipped on write and read.
+ model = read_yaml_model(yaml_file)
+ out = tmp_path / "out.yml.gz"
+ write_yaml_model(model, out)
+ assert out.read_bytes()[:2] == b"\x1f\x8b" # gzip magic
+ reloaded = read_yaml_model(out)
+ assert reloaded.id == "testModel"
+ assert {m.id for m in reloaded.metabolites} == {"s_0001", "s_0002"}
+
+
+def test_output_is_cobra_readable(yaml_file, tmp_path):
+ # The written file must load with stock cobra (it's cobra's native format).
+ model = read_yaml_model(yaml_file)
+ out = tmp_path / "out.yml"
+ write_yaml_model(model, out)
+ cobra_model = cobra.io.load_yaml_model(str(out))
+ assert cobra_model.id == "testModel"
+ assert {m.id for m in cobra_model.metabolites} == {"s_0001", "s_0002"}
+ # RAVEN-only fields land in cobra notes; smiles in annotation
+ assert cobra_model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"]
+
+
+def test_write_emits_raven_top_level_keys(yaml_file, tmp_path):
+ model = read_yaml_model(yaml_file)
+ out = tmp_path / "out.yml"
+ write_yaml_model(model, out)
+ text = out.read_text()
+ # RAVEN-only fields are lifted back to top-level entry keys, not buried in notes
+ assert "inchis:" in text
+ assert "deltaG:" in text
+ assert "confidence_score:" in text
+ assert "metaData:" in text
+
+
+def test_legacy_id_in_metadata(tmp_path):
+ # Older RAVEN files nest id/name under metaData and have no top-level id.
+ legacy = {
+ "metabolites": [{"id": "a_c", "name": "A", "compartment": "c"}],
+ "reactions": [],
+ "genes": [],
+ "compartments": {"c": "cyt"},
+ "metaData": {"id": "legacyModel", "name": "Legacy"},
+ }
+ p = tmp_path / "legacy.yml"
+ with open(p, "w", encoding="utf-8") as fh:
+ cobra_yaml.dump(legacy, fh)
+ model = read_yaml_model(p)
+ assert model.id == "legacyModel"
+ assert model.name == "Legacy"
+
+
+# Optional smoke test against a real model file if present.
+_YEAST = Path("/home/eduardk/github/GECKO/tutorials/full_ecModel/models/yeast-GEM.yml")
+
+
+@pytest.mark.skipif(not _YEAST.exists(), reason="real yeast-GEM.yml not available")
+def test_real_yeast_gem_loads():
+ model = read_yaml_model(_YEAST)
+ assert len(model.reactions) > 1000
+ # legacy file: identity comes from metaData
+ assert model.id
diff --git a/tests/test_manipulation_add.py b/tests/test_manipulation_add.py
new file mode 100644
index 0000000..2a3a9d3
--- /dev/null
+++ b/tests/test_manipulation_add.py
@@ -0,0 +1,278 @@
+"""Tests for raven_python.manipulation.add (addRxns port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils.parse import parse_name_comp
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+ cobra.Metabolite("h2o_c", name="H2O", compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+ cobra.Metabolite("pi_c", name="phosphate", compartment="c"),
+ ]
+ )
+ return m
+
+
+# --- parse_name_comp -------------------------------------------------------
+
+@pytest.mark.parametrize(
+ "token,expected",
+ [
+ ("ATP[c]", ("ATP", "c")),
+ ("ATP", ("ATP", None)),
+ (" ATP[c] ", ("ATP", "c")),
+ ("weird[name][m]", ("weird[name]", "m")),
+ ],
+)
+def test_parse_name_comp(token, expected):
+ assert parse_name_comp(token) == expected
+
+
+# --- id mode (eqnType 1) ---------------------------------------------------
+
+def test_add_by_id_basic_and_reversibility(model):
+ (rxn,) = add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "atp_c + h2o_c <=> adp_c + pi_c"}]
+ )
+ assert rxn.id == "R1"
+ assert rxn.reversibility is True
+ assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+ "atp_c": -1.0,
+ "h2o_c": -1.0,
+ "adp_c": 1.0,
+ "pi_c": 1.0,
+ }
+
+
+def test_irreversible_arrows(model):
+ rxns = add_reactions_from_equations(
+ model,
+ [
+ {"id": "R1", "equation": "atp_c --> adp_c"},
+ {"id": "R2", "equation": "atp_c => adp_c"},
+ ],
+ )
+ for r in rxns:
+ assert r.lower_bound == 0.0
+ assert r.reversibility is False
+
+
+def test_coefficients(model):
+ (rxn,) = add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "2 atp_c + 1.5 h2o_c --> adp_c"}]
+ )
+ assert rxn.get_coefficient("atp_c") == -2.0
+ assert rxn.get_coefficient("h2o_c") == -1.5
+
+
+def test_id_mode_creates_new_met_in_compartment(model):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> amp_c"}],
+ compartment="c",
+ )
+ assert "amp_c" in model.metabolites
+ assert model.metabolites.get_by_id("amp_c").compartment == "c"
+
+
+def test_id_mode_new_met_without_compartment_errors(model):
+ with pytest.raises(ValueError, match="no compartment"):
+ add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> amp_c"}])
+
+
+# --- name mode (eqnType 2) -------------------------------------------------
+
+def test_name_mode_matches_existing_by_name(model):
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP + H2O <=> ADP + phosphate"}],
+ mets_by="name",
+ compartment="c",
+ )
+ # resolved to the existing _c metabolites, not new ones
+ assert {m.id for m in rxn.metabolites} == {"atp_c", "h2o_c", "adp_c", "pi_c"}
+ assert len(model.metabolites) == 4
+
+
+def test_name_mode_creates_new_met_with_auto_id(model):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP --> AMP"}],
+ mets_by="name",
+ compartment="c",
+ )
+ new = [m for m in model.metabolites if m.name == "AMP"]
+ assert len(new) == 1
+ assert new[0].id == "m1"
+ assert new[0].compartment == "c"
+
+
+def test_name_mode_requires_compartment(model):
+ with pytest.raises(ValueError, match="needs a compartment"):
+ add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "ATP --> ADP"}], mets_by="name"
+ )
+
+
+# --- name[comp] mode (eqnType 3) -------------------------------------------
+
+def test_name_comp_syntax(model):
+ model.add_metabolites([cobra.Metabolite("atp_m", name="ATP", compartment="m")])
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP[c] --> ATP[m]"}],
+ mets_by="name",
+ compartment="c",
+ )
+ # matched ATP in two different compartments by name[comp]
+ assert {m.id for m in rxn.metabolites} == {"atp_c", "atp_m"}
+
+
+# --- genes -----------------------------------------------------------------
+
+def test_gene_rule_auto_creates_genes(model):
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}],
+ )
+ assert {g.id for g in rxn.genes} == {"G1", "G2"}
+ assert {g.id for g in model.genes} == {"G1", "G2"}
+
+
+def test_strict_genes_errors_on_unknown(model):
+ with pytest.raises(ValueError, match="genes not in the model"):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}],
+ allow_new_genes=False,
+ )
+
+
+def test_strict_genes_ok_when_present(model):
+ model.genes.append(cobra.core.gene.Gene("G1"))
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}],
+ allow_new_genes=False,
+ )
+ assert rxn.gene_reaction_rule == "G1"
+
+
+# --- guards & extras -------------------------------------------------------
+
+def test_duplicate_reaction_id_errors(model):
+ model.add_reactions([cobra.Reaction("R1")])
+ with pytest.raises(ValueError, match="already exists"):
+ add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> adp_c"}])
+
+
+def test_strict_mets_errors(model):
+ with pytest.raises(ValueError, match="allow_new_mets"):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> amp_c"}],
+ compartment="c",
+ allow_new_mets=False,
+ )
+
+
+def test_explicit_bounds_override_arrow(model):
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c <=> adp_c", "bounds": (0, 50), "name": "myrxn"}],
+ )
+ assert rxn.bounds == (0, 50)
+ assert rxn.name == "myrxn"
+
+
+def test_net_zero_metabolite_dropped(model):
+ # atp_c on both sides nets to zero and is removed.
+ (rxn,) = add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "atp_c + h2o_c --> atp_c + adp_c"}]
+ )
+ assert "atp_c" not in {m.id for m in rxn.metabolites}
+ assert {m.id for m in rxn.metabolites} == {"h2o_c", "adp_c"}
+
+
+def test_missing_equation_errors(model):
+ with pytest.raises(ValueError, match="missing required 'equation'"):
+ add_reactions_from_equations(model, [{"id": "R1"}])
+
+
+def test_no_arrow_errors(model):
+ with pytest.raises(ValueError, match="No reaction arrow"):
+ add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c + h2o_c"}])
+
+
+# --- regression: leading-number metabolite name (known_issues.md A1) -------
+
+def test_name_mode_preserves_leading_number_name(model):
+ """A metabolite name that begins with a number isn't misparsed as a coefficient.
+
+ Before the fix the token ``"2 oxoglutarate"`` was parsed as ``(coeff=2, name="oxoglutarate")``
+ silently — corrupting the stoichiometry. The resolver now prefers the full
+ token when it matches an existing metabolite name.
+ """
+ model.add_metabolites([
+ cobra.Metabolite("akg_c", name="2 oxoglutarate", compartment="c"),
+ ])
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP + 2 oxoglutarate --> ADP"}],
+ mets_by="name",
+ compartment="c",
+ )
+ assert rxn.get_coefficient("akg_c") == -1.0 # not -2.0
+ assert rxn.get_coefficient("atp_c") == -1.0
+
+
+def test_name_mode_coefficient_still_works_without_collision(model):
+ """If the full token doesn't match anything, fall back to coefficient split."""
+ (rxn,) = add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "2 ATP + H2O --> ADP + phosphate"}],
+ mets_by="name",
+ compartment="c",
+ )
+ assert rxn.get_coefficient("atp_c") == -2.0
+
+
+# --- regression: empty-stoichiometry warning (known_issues.md A2) ----------
+
+def test_empty_stoichiometry_warns(model):
+ """All-terms-cancel reaction warns instead of silently shipping an empty rxn."""
+ with pytest.warns(UserWarning, match="no net metabolites"):
+ (rxn,) = add_reactions_from_equations(
+ model, [{"id": "R1", "equation": "atp_c --> atp_c"}]
+ )
+ assert len(rxn.metabolites) == 0
+
+
+# --- regression: unknown-compartment warning (known_issues.md B2) ----------
+
+def test_id_mode_unknown_compartment_warns(model):
+ """A typo'd compartment used to silently produce a one-met ghost compartment
+ in id mode (the name/[comp] path used to validate, id mode never did)."""
+ with pytest.warns(UserWarning, match="unregistered compartment 'cyto'"):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "atp_c --> amp_c"}],
+ compartment="cyto", # typo for 'c'
+ )
+
+
+def test_name_comp_unknown_compartment_warns(model):
+ """Same defensive check in the name[comp] path when allow_new_mets=True."""
+ with pytest.warns(UserWarning, match="unregistered compartment 'mito'"):
+ add_reactions_from_equations(
+ model,
+ [{"id": "R1", "equation": "ATP[c] --> AMP[mito]"}],
+ mets_by="name",
+ )
diff --git a/tests/test_manipulation_change.py b/tests/test_manipulation_change.py
new file mode 100644
index 0000000..8d54f58
--- /dev/null
+++ b/tests/test_manipulation_change.py
@@ -0,0 +1,93 @@
+"""Tests for raven_python.manipulation.change (changeRxns port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, change_reaction_equations
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("a_c", name="A", compartment="c"),
+ cobra.Metabolite("b_c", name="B", compartment="c"),
+ cobra.Metabolite("c_c", name="C", compartment="c"),
+ ]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {
+ "id": "R1",
+ "equation": "a_c <=> b_c",
+ "name": "first",
+ "bounds": (-30, 70),
+ "gene_reaction_rule": "G1 or G2",
+ "subsystem": "sub",
+ },
+ {"id": "R2", "equation": "a_c --> c_c"},
+ ],
+ )
+ return m
+
+
+def test_changes_stoichiometry(model):
+ (rxn,) = change_reaction_equations(model, {"R1": "a_c --> 2 c_c"})
+ assert rxn.id == "R1"
+ assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+ "a_c": -1.0,
+ "c_c": 2.0,
+ }
+
+
+def test_preserves_other_fields(model):
+ before = model.reactions.get_by_id("R1")
+ name, bounds, subsystem = before.name, before.bounds, before.subsystem
+ genes = {g.id for g in before.genes}
+
+ change_reaction_equations(model, {"R1": "a_c --> c_c"})
+
+ after = model.reactions.get_by_id("R1")
+ assert after.name == name
+ assert after.bounds == bounds # bounds untouched, per RAVEN
+ assert after.subsystem == subsystem
+ assert {g.id for g in after.genes} == genes
+
+
+def test_preserves_reaction_order(model):
+ order_before = [r.id for r in model.reactions]
+ change_reaction_equations(model, {"R1": "b_c --> c_c"})
+ assert [r.id for r in model.reactions] == order_before
+
+
+def test_bounds_not_changed_by_arrow(model):
+ # R1 starts reversible (-30, 70); a --> arrow must NOT make it irreversible.
+ change_reaction_equations(model, {"R1": "a_c --> b_c"})
+ assert model.reactions.get_by_id("R1").bounds == (-30, 70)
+
+
+def test_name_mode(model):
+ (rxn,) = change_reaction_equations(
+ model, {"R2": "A --> C"}, mets_by="name", compartment="c"
+ )
+ assert {m.id for m in rxn.metabolites} == {"a_c", "c_c"}
+
+
+def test_can_introduce_new_met(model):
+ change_reaction_equations(
+ model, {"R2": "a_c --> d_c"}, compartment="c"
+ )
+ assert "d_c" in model.metabolites
+ assert model.reactions.get_by_id("R2").get_coefficient("d_c") == 1.0
+
+
+def test_unknown_reaction_errors(model):
+ with pytest.raises(ValueError, match="not found"):
+ change_reaction_equations(model, {"NOPE": "a_c --> b_c"})
+
+
+def test_multiple_reactions(model):
+ changed = change_reaction_equations(model, {"R1": "a_c --> c_c", "R2": "b_c --> c_c"})
+ assert [r.id for r in changed] == ["R1", "R2"]
+ assert model.reactions.get_by_id("R2").get_coefficient("b_c") == -1.0
diff --git a/tests/test_manipulation_compartments.py b/tests/test_manipulation_compartments.py
new file mode 100644
index 0000000..4d3fb3b
--- /dev/null
+++ b/tests/test_manipulation_compartments.py
@@ -0,0 +1,139 @@
+"""Tests for manipulation/compartments.py — merge_compartments + copy_to_compartment."""
+from __future__ import annotations
+
+import cobra
+import pytest
+
+from raven_python.manipulation.compartments import copy_to_compartment, merge_compartments
+
+
+def _two_compartment_model() -> cobra.Model:
+ """A_c → B_c, A_m → B_m, and a transport A_c ↔ A_m. Multi-compartment toy."""
+ m = cobra.Model("toy")
+ A_c = cobra.Metabolite("A_c", name="A", compartment="c")
+ A_m = cobra.Metabolite("A_m", name="A", compartment="m")
+ B_c = cobra.Metabolite("B_c", name="B", compartment="c")
+ B_m = cobra.Metabolite("B_m", name="B", compartment="m")
+ m.add_metabolites([A_c, A_m, B_c, B_m])
+
+ def rxn(rid, lb, ub, mets, gpr=None):
+ r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub)
+ r.add_metabolites(mets)
+ if gpr:
+ r.gene_reaction_rule = gpr
+ return r
+ m.add_reactions([rxn("r_c", 0, 1000, {A_c: -1, B_c: 1}, "g1"),
+ rxn("r_m", 0, 1000, {A_m: -1, B_m: 1}, "g2"),
+ rxn("tr_A", -1000, 1000, {A_c: -1, A_m: 1})])
+ return m
+
+
+# ----------------------------------------------------------------- merge_compartments
+
+def test_merge_compartments_collapses_to_one():
+ """A_c + A_m → A; B_c + B_m → B; transport A_c↔A_m self-cancels and is dropped."""
+ m = _two_compartment_model()
+ merged, deleted, dupes = merge_compartments(m)
+ # Only the base ids survive.
+ assert {x.id for x in merged.metabolites} == {"A", "B"}
+ # The transport reaction collapsed (A → A) and was deleted.
+ assert "tr_A" in deleted
+ # r_c and r_m are now both A → B; one of them gets deduplicated.
+ surviving = {r.id for r in merged.reactions}
+ assert len(surviving & {"r_c", "r_m"}) == 1
+ assert (set(dupes) | (surviving & {"r_c", "r_m"})) == {"r_c", "r_m"}
+
+
+def test_merge_compartments_preserves_gpr_and_subsystem():
+ m = _two_compartment_model()
+ m.reactions.r_c.subsystem = "carbo"
+ merged, _, _ = merge_compartments(m)
+ survivor = next(r for r in merged.reactions if r.id in {"r_c", "r_m"})
+ # The survivor keeps its gene rule + subsystem (cobra may sometimes lose them
+ # through copy; we set them explicitly).
+ assert survivor.gene_reaction_rule in {"g1", "g2"}
+ if survivor.id == "r_c":
+ assert survivor.subsystem == "carbo"
+
+
+def test_merge_compartments_keeps_single_met_reactions_when_asked():
+ """drop_single_metabolite_reactions=False keeps the collapsed transport (now A → A,
+ which is empty stoichiometry after net-cancellation — still dropped, but the *one-met*
+ case is the more interesting one). Use a uniport pattern to exercise it."""
+ m = cobra.Model("uniport")
+ A_c = cobra.Metabolite("A_c", name="A", compartment="c")
+ A_m = cobra.Metabolite("A_m", name="A", compartment="m")
+ H_c = cobra.Metabolite("H_c", name="H", compartment="c")
+ m.add_metabolites([A_c, A_m, H_c])
+ # H+ symport: A_c + H_c → A_m. After merge: A + H → A → leaves H.
+ sym = cobra.Reaction("sym", lower_bound=0, upper_bound=1000)
+ sym.add_metabolites({A_c: -1, H_c: -1, A_m: 1})
+ m.add_reactions([sym])
+ merged_drop, deleted_drop, _ = merge_compartments(m, drop_single_metabolite_reactions=True)
+ assert "sym" in deleted_drop
+ merged_keep, deleted_keep, _ = merge_compartments(m, drop_single_metabolite_reactions=False)
+ # With keep, sym survives as a one-met reaction (consumes H).
+ assert "sym" not in deleted_keep
+ assert "sym" in {r.id for r in merged_keep.reactions}
+
+
+def test_merge_compartments_deduplicate_off_keeps_both():
+ m = _two_compartment_model()
+ merged, _, dupes = merge_compartments(m, deduplicate_reactions=False)
+ assert dupes == []
+ assert {"r_c", "r_m"} <= {r.id for r in merged.reactions}
+
+
+# ----------------------------------------------------------------- copy_to_compartment
+
+def test_copy_to_compartment_basic():
+ """Copy r_c into 'p' (peroxisome): a new reaction r_c_p with metabolites in p."""
+ m = _two_compartment_model()
+ out, new_rxns, new_mets = copy_to_compartment(m, ["r_c"], "p",
+ target_compartment_name="peroxisome")
+ assert "r_c_p" in [r.id for r in out.reactions]
+ new_r = out.reactions.r_c_p
+ assert {x.compartment for x in new_r.metabolites} == {"p"}
+ assert "A_p" in [x.id for x in out.metabolites]
+ assert "B_p" in [x.id for x in out.metabolites]
+ assert new_rxns == ["r_c_p"]
+ assert set(new_mets) == {"A_p", "B_p"}
+ # Original still there.
+ assert "r_c" in [r.id for r in out.reactions]
+
+
+def test_copy_to_compartment_preserves_gpr_and_bounds():
+ m = _two_compartment_model()
+ out, _, _ = copy_to_compartment(m, ["r_c"], "p")
+ new_r = out.reactions.r_c_p
+ assert new_r.gene_reaction_rule == "g1"
+ assert new_r.lower_bound == 0 and new_r.upper_bound == 1000
+
+
+def test_copy_to_compartment_delete_original_is_a_move():
+ m = _two_compartment_model()
+ out, _, _ = copy_to_compartment(m, ["r_c"], "p", delete_original=True)
+ assert "r_c" not in [r.id for r in out.reactions]
+ assert "r_c_p" in [r.id for r in out.reactions]
+
+
+def test_copy_to_compartment_idempotent():
+ """Calling twice doesn't add the reaction twice."""
+ m = _two_compartment_model()
+ out, _, _ = copy_to_compartment(m, ["r_c"], "p")
+ out2, new_rxns, _ = copy_to_compartment(out, ["r_c"], "p")
+ assert new_rxns == [] # nothing added on second call
+ assert len([r for r in out2.reactions if r.id == "r_c_p"]) == 1
+
+
+def test_copy_to_compartment_unknown_reaction_raises():
+ m = _two_compartment_model()
+ with pytest.raises(ValueError, match="not in model"):
+ copy_to_compartment(m, ["does_not_exist"], "p")
+
+
+def test_copy_to_compartment_custom_suffix():
+ m = _two_compartment_model()
+ out, new_rxns, _ = copy_to_compartment(m, ["r_c"], "p", id_suffix="copy1")
+ assert new_rxns == ["r_c_copy1"]
+ assert "A_copy1" in [x.id for x in out.metabolites]
diff --git a/tests/test_manipulation_expand.py b/tests/test_manipulation_expand.py
new file mode 100644
index 0000000..08cd2f2
--- /dev/null
+++ b/tests/test_manipulation_expand.py
@@ -0,0 +1,288 @@
+"""Tests for expand_model (RAVEN expandModel.m) — splitting isozymes into reactions.
+
+Adopted from geckopy's tests/test_expand.py.
+"""
+import cobra
+
+from raven_python.manipulation import expand_model
+from raven_python.manipulation.expand import _gpr_to_dnf
+
+# --------------------------------------------------------------------------- #
+# DNF conversion (internal helper, worth testing directly)
+# --------------------------------------------------------------------------- #
+
+def _dnf_from_gpr_string(gpr_str: str) -> list[list[str]]:
+ from cobra.core.gene import GPR
+
+ gpr = GPR.from_string(gpr_str)
+ return _gpr_to_dnf(gpr)
+
+
+def test_dnf_empty_gpr():
+ assert _dnf_from_gpr_string("") == []
+
+
+def test_dnf_single_gene():
+ assert _dnf_from_gpr_string("g1") == [["g1"]]
+
+
+def test_dnf_simple_and():
+ assert _dnf_from_gpr_string("g1 and g2") == [["g1", "g2"]]
+
+
+def test_dnf_simple_or():
+ assert _dnf_from_gpr_string("g1 or g2") == [["g1"], ["g2"]]
+
+
+def test_dnf_or_of_ands():
+ assert _dnf_from_gpr_string("(g1 and g2) or (g3 and g4)") == [
+ ["g1", "g2"],
+ ["g3", "g4"],
+ ]
+
+
+def test_dnf_distributes_and_over_or():
+ result = _dnf_from_gpr_string("g1 and (g2 or g3)")
+ assert result == [["g1", "g2"], ["g1", "g3"]]
+
+
+def test_dnf_triple_or():
+ assert _dnf_from_gpr_string("g1 or g2 or g3") == [
+ ["g1"], ["g2"], ["g3"],
+ ]
+
+
+def test_dnf_preserves_gene_order_within_clause():
+ result = _dnf_from_gpr_string("g3 and g1 and g2")
+ assert result == [["g3", "g1", "g2"]]
+
+
+# --------------------------------------------------------------------------- #
+# expand_model
+# --------------------------------------------------------------------------- #
+
+def _build_model(
+ reactions: list[tuple[str, dict[str, float], float, float, str]],
+) -> cobra.Model:
+ """Build from (rxn_id, {met_id: coef}, lb, ub, gpr) tuples."""
+ model = cobra.Model("test")
+ mets: dict[str, cobra.Metabolite] = {}
+ for _, stoich, _, _, _ in reactions:
+ for met_id in stoich:
+ if met_id not in mets:
+ mets[met_id] = cobra.Metabolite(met_id, compartment="c")
+
+ for rxn_id, stoich, lb, ub, gpr in reactions:
+ rxn = cobra.Reaction(rxn_id)
+ rxn.lower_bound = lb
+ rxn.upper_bound = ub
+ rxn.add_metabolites({mets[m]: c for m, c in stoich.items()})
+ if gpr:
+ rxn.gene_reaction_rule = gpr
+ model.add_reactions([rxn])
+ return model
+
+
+def test_does_not_expand_reaction_without_gpr():
+ model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "")])
+ added = expand_model(model)
+ assert added == []
+ assert "r1" in {r.id for r in model.reactions}
+
+
+def test_does_not_expand_single_and_clause():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 and g2"),
+ ])
+ added = expand_model(model)
+ assert added == []
+ r1 = model.reactions.get_by_id("r1")
+ assert r1.gene_reaction_rule == "g1 and g2"
+
+
+def test_does_not_expand_single_gene():
+ model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1")])
+ added = expand_model(model)
+ assert added == []
+ assert model.reactions.get_by_id("r1").gene_reaction_rule == "g1"
+
+
+def test_splits_simple_or_into_two_reactions():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ])
+ added = expand_model(model)
+
+ assert added == ["r1_EXP_1", "r1_EXP_2"]
+ rxn_ids = {r.id for r in model.reactions}
+ assert "r1" not in rxn_ids
+ assert "r1_EXP_1" in rxn_ids
+ assert "r1_EXP_2" in rxn_ids
+
+ assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1"
+ assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g2"
+
+
+def test_splits_or_of_ands():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+ "(g1 and g2) or (g3 and g4)"),
+ ])
+ added = expand_model(model)
+
+ assert added == ["r1_EXP_1", "r1_EXP_2"]
+ assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2"
+ assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g3 and g4"
+
+
+def test_distributes_and_over_or():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+ "g1 and (g2 or g3)"),
+ ])
+ added = expand_model(model)
+
+ assert added == ["r1_EXP_1", "r1_EXP_2"]
+ assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2"
+ assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g1 and g3"
+
+
+def test_expanded_reactions_inherit_stoichiometry_and_bounds():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 2.0}, -500.0, 1500.0, "g1 or g2"),
+ ])
+ expand_model(model)
+
+ for suffix in ("_EXP_1", "_EXP_2"):
+ rxn = model.reactions.get_by_id(f"r1{suffix}")
+ assert rxn.bounds == (-500.0, 1500.0)
+ stoich = {m.id: c for m, c in rxn.metabolites.items()}
+ assert stoich == {"A": -1.0, "B": 2.0}
+
+
+def test_expanded_reactions_inherit_name_and_subsystem():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ])
+ r1 = model.reactions.get_by_id("r1")
+ r1.name = "an isozyme-catalyzed reaction"
+ r1.subsystem = "central metabolism"
+
+ expand_model(model)
+
+ for suffix in ("_EXP_1", "_EXP_2"):
+ rxn = model.reactions.get_by_id(f"r1{suffix}")
+ assert rxn.name == "an isozyme-catalyzed reaction"
+ assert rxn.subsystem == "central metabolism"
+
+
+def test_multiple_reactions_expand_independently():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"),
+ ("r3", {"C": -1.0, "D": 1.0}, 0.0, 1000.0,
+ "(g5 and g6) or g7 or (g8 and g9)"),
+ ])
+ added = expand_model(model)
+
+ assert added == sorted([
+ "r1_EXP_1", "r1_EXP_2",
+ "r3_EXP_1", "r3_EXP_2", "r3_EXP_3",
+ ])
+
+ rxn_ids = {r.id for r in model.reactions}
+ assert "r2" in rxn_ids
+ assert "r1" not in rxn_ids
+ assert "r3" not in rxn_ids
+
+ assert model.reactions.get_by_id("r2").gene_reaction_rule == "g3 and g4"
+ assert model.reactions.get_by_id("r3_EXP_1").gene_reaction_rule == "g5 and g6"
+ assert model.reactions.get_by_id("r3_EXP_2").gene_reaction_rule == "g7"
+ assert model.reactions.get_by_id("r3_EXP_3").gene_reaction_rule == "g8 and g9"
+
+
+def test_expanded_reaction_has_correct_gene_set():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0,
+ "(g1 and g2) or (g3 and g4)"),
+ ])
+ expand_model(model)
+
+ r1_1 = model.reactions.get_by_id("r1_EXP_1")
+ assert {g.id for g in r1_1.genes} == {"g1", "g2"}
+
+ r1_2 = model.reactions.get_by_id("r1_EXP_2")
+ assert {g.id for g in r1_2.genes} == {"g3", "g4"}
+
+
+def test_expansion_is_idempotent_in_the_no_op_sense():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"),
+ ])
+ expand_model(model)
+ ids_before = {r.id for r in model.reactions}
+
+ second = expand_model(model)
+ assert second == []
+
+ ids_after = {r.id for r in model.reactions}
+ assert ids_after == ids_before
+
+
+def test_empty_model_is_unchanged():
+ model = cobra.Model("empty")
+ assert expand_model(model) == []
+
+
+# --------------------------------------------------------------------------- #
+# Annotation and notes propagation
+# --------------------------------------------------------------------------- #
+
+def test_expanded_reactions_inherit_annotation_and_notes():
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ])
+ r1 = model.reactions.get_by_id("r1")
+ r1.annotation["ec-code"] = "1.2.3.4"
+ r1.annotation["sbo"] = "SBO:0000176"
+ r1.notes["custom"] = "hello"
+
+ expand_model(model)
+
+ for suffix in ("_EXP_1", "_EXP_2"):
+ rxn = model.reactions.get_by_id(f"r1{suffix}")
+ assert rxn.annotation["ec-code"] == "1.2.3.4"
+ assert rxn.annotation["sbo"] == "SBO:0000176"
+ assert rxn.notes["custom"] == "hello"
+
+
+def test_expanded_reaction_annotation_is_independent_of_parent():
+ """Mutating one expanded reaction's annotation must not affect siblings."""
+ model = _build_model([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"),
+ ])
+ model.reactions.get_by_id("r1").annotation["ec-code"] = ["1.2.3.4"]
+
+ expand_model(model)
+
+ r1_1 = model.reactions.get_by_id("r1_EXP_1")
+ r1_2 = model.reactions.get_by_id("r1_EXP_2")
+ r1_1.annotation["ec-code"].append("9.9.9.9")
+ assert r1_2.annotation["ec-code"] == ["1.2.3.4"]
+
+
+def test_objective_coefficient_preserved_on_expansion():
+ """An expanded reaction's isozyme copies retain the original objective coefficient."""
+ m = cobra.Model("o")
+ a, b = (cobra.Metabolite(x, compartment="c") for x in "ab")
+ m.add_metabolites([a, b])
+ r = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+ r.add_metabolites({a: -1, b: 1})
+ r.gene_reaction_rule = "g1 or g2"
+ m.add_reactions([r])
+ m.objective = "r1" # objective on the soon-to-be-expanded reaction
+
+ expand_model(m)
+ coeffs = {rx.id: rx.objective_coefficient for rx in m.reactions}
+ assert coeffs == {"r1_EXP_1": 1.0, "r1_EXP_2": 1.0} # objective survives on both copies
diff --git a/tests/test_manipulation_irreversible.py b/tests/test_manipulation_irreversible.py
new file mode 100644
index 0000000..e211fa3
--- /dev/null
+++ b/tests/test_manipulation_irreversible.py
@@ -0,0 +1,144 @@
+"""Tests for convert_to_irreversible (RAVEN convertToIrrev.m).
+
+Adopted from geckopy's tests/test_preprocess.py (the convert_to_irreversible subset).
+Exchange reactions are excluded from the split, matching MATLAB behavior.
+"""
+import cobra
+
+from raven_python.manipulation import convert_to_irreversible
+
+
+def _build_model_with_bounds(
+ reactions: list[tuple[str, dict[str, float], float, float]],
+) -> cobra.Model:
+ """Build from (rxn_id, {met_id: coef}, lb, ub) tuples."""
+ model = cobra.Model("test")
+ mets: dict[str, cobra.Metabolite] = {}
+ for _, stoich, _, _ in reactions:
+ for met_id in stoich:
+ if met_id not in mets:
+ mets[met_id] = cobra.Metabolite(met_id, compartment="c")
+
+ for rxn_id, stoich, lb, ub in reactions:
+ rxn = cobra.Reaction(rxn_id)
+ rxn.lower_bound = lb
+ rxn.upper_bound = ub
+ rxn.add_metabolites({mets[m]: c for m, c in stoich.items()})
+ model.add_reactions([rxn])
+ return model
+
+
+def test_splits_single_reversible_non_exchange():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ])
+
+ added = convert_to_irreversible(model)
+ assert added == ["r1_REV"]
+
+ fwd = model.reactions.get_by_id("r1")
+ rev = model.reactions.get_by_id("r1_REV")
+
+ assert fwd.bounds == (0.0, 1000.0)
+ assert {m.id: c for m, c in fwd.metabolites.items()} == {"A": -1.0, "B": 1.0}
+
+ assert rev.bounds == (0.0, 500.0)
+ assert {m.id: c for m, c in rev.metabolites.items()} == {"A": 1.0, "B": -1.0}
+
+
+def test_does_not_split_forward_only_reaction():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0),
+ ])
+ added = convert_to_irreversible(model)
+ assert added == []
+ assert "r1_REV" not in {r.id for r in model.reactions}
+
+
+def test_does_not_split_exchange_reaction_even_if_reversible():
+ """Exchange reactions (one metabolite) are explicitly excluded from
+ the irreversibility step in MATLAB, regardless of bounds."""
+ model = _build_model_with_bounds([
+ ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+ ])
+ added = convert_to_irreversible(model)
+ assert added == []
+ ex = model.reactions.get_by_id("EX_A")
+ assert ex.bounds == (-1000.0, 1000.0)
+
+
+def test_splits_multiple_mixed_reactions():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), # split
+ ("r2", {"B": -2.0, "C": 3.0}, 0.0, 1000.0), # forward only
+ ("EX_A", {"A": -1.0}, -1000.0, 1000.0), # exchange
+ ("r3", {"C": -1.0, "D": 1.0}, -200.0, 200.0), # split
+ ])
+
+ added = convert_to_irreversible(model)
+ assert added == ["r1_REV", "r3_REV"]
+
+ assert model.reactions.get_by_id("r1").bounds == (0.0, 1000.0)
+ assert model.reactions.get_by_id("r1_REV").bounds == (0.0, 500.0)
+ assert model.reactions.get_by_id("r2").bounds == (0.0, 1000.0)
+ assert model.reactions.get_by_id("EX_A").bounds == (-1000.0, 1000.0)
+ assert model.reactions.get_by_id("r3").bounds == (0.0, 200.0)
+ assert model.reactions.get_by_id("r3_REV").bounds == (0.0, 200.0)
+
+
+def test_reverse_reaction_inherits_gpr():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ])
+ model.reactions.get_by_id("r1").gene_reaction_rule = "g1 and g2"
+
+ convert_to_irreversible(model)
+
+ rev = model.reactions.get_by_id("r1_REV")
+ assert rev.gene_reaction_rule == "g1 and g2"
+ assert {g.id for g in rev.genes} == {"g1", "g2"}
+
+
+def test_forward_reaction_lb_is_clamped_to_zero():
+ """After splitting, the original reaction should have lb = 0,
+ which is what MATLAB's convertToIrrev does."""
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ])
+ convert_to_irreversible(model)
+ assert model.reactions.get_by_id("r1").lower_bound == 0.0
+
+
+def test_no_reverse_reaction_has_negative_bound():
+ """After conversion, no non-exchange reaction may carry negative flux."""
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ("r2", {"B": -1.0, "C": 1.0}, -1000.0, 0.0), # blocked reverse
+ ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+ ])
+ convert_to_irreversible(model)
+ for rxn in model.reactions:
+ if rxn.boundary:
+ continue
+ assert rxn.lower_bound >= 0, f"{rxn.id} still has lb < 0"
+
+
+def test_returns_empty_list_when_nothing_to_split():
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0),
+ ("EX_A", {"A": -1.0}, -1000.0, 1000.0),
+ ])
+ assert convert_to_irreversible(model) == []
+
+
+def test_conversion_is_idempotent_after_first_pass():
+ """Running convert_to_irreversible twice should not create
+ `_REV_REV` reactions, because the first pass already clamped
+ all non-exchange lb to 0."""
+ model = _build_model_with_bounds([
+ ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0),
+ ])
+ convert_to_irreversible(model)
+ second = convert_to_irreversible(model)
+ assert second == []
+ assert "r1_REV_REV" not in {r.id for r in model.reactions}
diff --git a/tests/test_manipulation_merge.py b/tests/test_manipulation_merge.py
new file mode 100644
index 0000000..a430f6e
--- /dev/null
+++ b/tests/test_manipulation_merge.py
@@ -0,0 +1,136 @@
+"""Tests for merge_models (mergeModels port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, merge_models
+
+
+def _model(mid, mets, reactions):
+ m = cobra.Model(mid)
+ m.add_metabolites(mets)
+ add_reactions_from_equations(m, reactions)
+ return m
+
+
+@pytest.fixture
+def model_a():
+ return _model(
+ "A",
+ [
+ cobra.Metabolite("glc_c", name="Glucose", compartment="c"),
+ cobra.Metabolite("g6p_c", name="G6P", compartment="c"),
+ ],
+ [{"id": "HEX", "equation": "glc_c --> g6p_c", "gene_reaction_rule": "GA"}],
+ )
+
+
+@pytest.fixture
+def model_b():
+ # same Glucose[c] compound but a DIFFERENT id
+ return _model(
+ "B",
+ [
+ cobra.Metabolite("glucose_c", name="Glucose", compartment="c"),
+ cobra.Metabolite("lac_c", name="Lactate", compartment="c"),
+ ],
+ [{"id": "LDH", "equation": "glucose_c --> lac_c", "gene_reaction_rule": "GB"}],
+ )
+
+
+def test_unifies_metabolites_by_name_comp(model_a, model_b):
+ merged = merge_models([model_a, model_b])
+ glucoses = [m for m in merged.metabolites if m.name == "Glucose" and m.compartment == "c"]
+ assert len(glucoses) == 1 # glc_c and glucose_c unified
+ # both reactions reference the same merged Glucose object
+ hex_glc = [m for m in merged.reactions.get_by_id("HEX").metabolites if m.name == "Glucose"][0]
+ ldh_glc = [m for m in merged.reactions.get_by_id("LDH").metabolites if m.name == "Glucose"][0]
+ assert hex_glc is ldh_glc
+
+
+def test_match_by_id_keeps_distinct(model_a, model_b):
+ merged = merge_models([model_a, model_b], match_by="id")
+ glucoses = [m for m in merged.metabolites if m.name == "Glucose"]
+ assert len(glucoses) == 2 # glc_c and glucose_c are distinct by id
+
+
+def test_all_reactions_kept(model_a, model_b):
+ merged = merge_models([model_a, model_b])
+ assert {"HEX", "LDH"} <= {r.id for r in merged.reactions}
+
+
+def test_reaction_id_collision_renamed(model_a):
+ # two models with the same reaction id but different chemistry
+ other = _model(
+ "B",
+ [cobra.Metabolite("glc_c", name="Glucose", compartment="c"),
+ cobra.Metabolite("x_c", name="X", compartment="c")],
+ [{"id": "HEX", "equation": "glc_c --> x_c"}],
+ )
+ merged = merge_models([model_a, other])
+ assert "HEX" in {r.id for r in merged.reactions}
+ assert "HEX_B" in {r.id for r in merged.reactions} # renamed with source id
+
+
+def test_genes_merged(model_a, model_b):
+ merged = merge_models([model_a, model_b])
+ assert {"GA", "GB"} <= {g.id for g in merged.genes}
+
+
+def test_provenance_recorded(model_a, model_b):
+ merged = merge_models([model_a, model_b])
+ assert merged.reactions.get_by_id("HEX").notes["origin"] == "A"
+ assert merged.reactions.get_by_id("LDH").notes["origin"] == "B"
+ assert merged.genes.get_by_id("GA").notes["origin"] == "A"
+
+
+def test_compartments_preserved(model_a):
+ model_a.compartments = {"c": "cytoplasm"}
+ merged = merge_models([model_a, model_a.copy()])
+ assert merged.compartments.get("c") == "cytoplasm"
+
+
+def test_single_model_returns_copy(model_a):
+ merged = merge_models([model_a])
+ assert merged is not model_a
+ assert {r.id for r in merged.reactions} == {r.id for r in model_a.reactions}
+
+
+def test_three_models(model_a, model_b):
+ c = _model("C", [cobra.Metabolite("co2_c", name="CO2", compartment="c")],
+ [{"id": "SINK", "equation": "co2_c -->"}])
+ merged = merge_models([model_a, model_b, c])
+ assert {"HEX", "LDH", "SINK"} <= {r.id for r in merged.reactions}
+
+
+def test_bad_match_by(model_a, model_b):
+ with pytest.raises(ValueError, match="match_by"):
+ merge_models([model_a, model_b], match_by="oops")
+
+
+# --- regression: formula/charge conflict (known_issues.md B1) --------------
+
+def test_formula_conflict_warns():
+ """Two models sharing a name[comp] but with different formulas warn instead
+ of silently keeping the first."""
+ a = _model("A",
+ [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", compartment="c")],
+ [{"id": "EX_A", "equation": "g1 -->"}])
+ b = _model("B",
+ [cobra.Metabolite("g2", name="Glucose", formula="C6H12O7", compartment="c")],
+ [{"id": "EX_B", "equation": "g2 -->"}])
+ with pytest.warns(UserWarning, match="different formulas"):
+ merged = merge_models([a, b])
+ # The merge still picks the first-seen — the test asserts the warning fired
+ # and the model survives.
+ assert "EX_A" in merged.reactions and "EX_B" in merged.reactions
+
+
+def test_charge_conflict_warns():
+ a = _model("A",
+ [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", charge=0, compartment="c")],
+ [{"id": "EX_A", "equation": "g1 -->"}])
+ b = _model("B",
+ [cobra.Metabolite("g2", name="Glucose", formula="C6H12O6", charge=-1, compartment="c")],
+ [{"id": "EX_B", "equation": "g2 -->"}])
+ with pytest.warns(UserWarning, match="different charges"):
+ merge_models([a, b])
diff --git a/tests/test_manipulation_remove.py b/tests/test_manipulation_remove.py
new file mode 100644
index 0000000..2b659b9
--- /dev/null
+++ b/tests/test_manipulation_remove.py
@@ -0,0 +1,97 @@
+"""Tests for raven_python.manipulation.remove (removeMets/removeGenes ports)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import (
+ add_reactions_from_equations,
+ remove_genes,
+ remove_metabolites,
+)
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("atp_c", name="ATP", compartment="c"),
+ cobra.Metabolite("atp_m", name="ATP", compartment="m"),
+ cobra.Metabolite("adp_c", name="ADP", compartment="c"),
+ cobra.Metabolite("x_c", name="X", compartment="c"),
+ ]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"},
+ {"id": "R2", "equation": "atp_c --> x_c", "gene_reaction_rule": "G3 or G4"},
+ {"id": "R3", "equation": "atp_m --> adp_c"}, # no GPR (spontaneous)
+ ],
+ )
+ return m
+
+
+# --- remove_metabolites ----------------------------------------------------
+
+def test_remove_metabolites_by_id(model):
+ remove_metabolites(model, ["x_c"])
+ assert "x_c" not in model.metabolites
+ # reaction kept, just lost the metabolite
+ assert "R2" in model.reactions
+
+
+def test_remove_metabolites_by_name_across_compartments(model):
+ # "ATP" exists in c and m; by_name removes both at once.
+ remove_metabolites(model, ["ATP"], by_name=True)
+ assert "atp_c" not in model.metabolites
+ assert "atp_m" not in model.metabolites
+ assert "adp_c" in model.metabolites
+
+
+def test_remove_metabolites_destructive(model):
+ remove_metabolites(model, ["adp_c"], destructive=True)
+ # R1 and R3 both produced adp_c -> removed
+ assert "adp_c" not in model.metabolites
+ assert "R1" not in model.reactions and "R3" not in model.reactions
+
+
+# --- remove_genes ----------------------------------------------------------
+
+def test_remove_genes_remove_mode(model):
+ blocked = remove_genes(model, ["G1"], blocked_reactions="remove")
+ # R1 = "G1 and G2": removing G1 breaks the complex -> blocked -> removed
+ assert blocked == ["R1"]
+ assert "R1" not in model.reactions
+ assert "R2" in model.reactions # OR rule unaffected
+
+
+def test_remove_genes_constrain_mode(model):
+ blocked = remove_genes(model, ["G1"], blocked_reactions="constrain")
+ assert blocked == ["R1"]
+ r1 = model.reactions.get_by_id("R1")
+ assert r1.bounds == (0, 0) # kept but constrained, per RAVEN default
+ assert r1.gene_reaction_rule == ""
+
+
+def test_remove_genes_keep_mode(model):
+ blocked = remove_genes(model, ["G1"], blocked_reactions="keep")
+ assert blocked == ["R1"]
+ r1 = model.reactions.get_by_id("R1")
+ assert r1.gene_reaction_rule == ""
+ assert r1.bounds != (0, 0) # left untouched
+
+
+def test_remove_genes_or_rule_not_blocked(model):
+ blocked = remove_genes(model, ["G3"], blocked_reactions="remove")
+ # R2 = "G3 or G4": removing G3 leaves G4 -> not blocked
+ assert blocked == []
+ assert model.reactions.get_by_id("R2").gene_reaction_rule == "G4"
+
+
+def test_remove_genes_absent_gene_is_noop(model):
+ assert remove_genes(model, ["NOPE"]) == []
+
+
+def test_remove_genes_bad_policy(model):
+ with pytest.raises(ValueError, match="blocked_reactions"):
+ remove_genes(model, ["G1"], blocked_reactions="explode")
diff --git a/tests/test_manipulation_simplify.py b/tests/test_manipulation_simplify.py
new file mode 100644
index 0000000..586a0c3
--- /dev/null
+++ b/tests/test_manipulation_simplify.py
@@ -0,0 +1,184 @@
+"""Tests for simplifyModel reduction modes."""
+import cobra
+import pytest
+
+from raven_python.manipulation import (
+ add_reactions_from_equations,
+ constrain_reversible_reactions,
+ group_linear_reactions,
+ remove_dead_end_reactions,
+ remove_duplicate_reactions,
+)
+
+# --- remove_dead_end_reactions --------------------------------------------
+
+def test_dead_end_removed():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "dead")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_in", "equation": " --> a"},
+ {"id": "R1", "equation": "a --> b"},
+ {"id": "R_out", "equation": "b --> "},
+ {"id": "R_dead", "equation": "a --> dead"}, # 'dead' only produced
+ ],
+ )
+ removed_rxns, removed_mets = remove_dead_end_reactions(m)
+ assert "R_dead" in removed_rxns
+ assert "dead" in removed_mets
+ # the productive path survives
+ assert {"R_in", "R1", "R_out"} <= {r.id for r in m.reactions}
+
+
+def test_dead_end_respects_reserved():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "dead")])
+ add_reactions_from_equations(
+ m, [{"id": "R_in", "equation": " --> a"}, {"id": "R_dead", "equation": "a --> dead"}]
+ )
+ removed_rxns, _ = remove_dead_end_reactions(m, reserved=["R_dead"])
+ assert "R_dead" not in removed_rxns
+ assert "R_dead" in {r.id for r in m.reactions}
+
+
+# --- remove_duplicate_reactions -------------------------------------------
+
+def test_duplicates_removed():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)},
+ {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)}, # duplicate of R1
+ {"id": "R3", "equation": "a --> b", "bounds": (0, 500)}, # different bounds
+ ],
+ )
+ removed = remove_duplicate_reactions(m)
+ assert len(removed) == 1 # one of R1/R2 removed
+ assert {"R3"} <= {r.id for r in m.reactions}
+ assert sum(r.id in ("R1", "R2") for r in m.reactions) == 1
+
+
+def test_duplicates_keep_reserved():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)},
+ {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)},
+ ],
+ )
+ remove_duplicate_reactions(m, reserved=["R1"])
+ assert "R1" in {r.id for r in m.reactions} # reserved one kept
+
+
+# --- constrain_reversible_reactions ---------------------------------------
+
+def test_forward_only_reversible_constrained():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_in", "equation": " --> a", "bounds": (0, 1000)},
+ {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)}, # can only go fwd
+ {"id": "R_out", "equation": "b --> ", "bounds": (0, 1000)},
+ ],
+ )
+ changed = constrain_reversible_reactions(m)
+ assert "R1" in changed
+ assert m.reactions.get_by_id("R1").lower_bound == 0 # constrained to forward
+
+
+def test_truly_reversible_unchanged():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_in", "equation": " <=> a", "bounds": (-1000, 1000)},
+ {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)},
+ {"id": "R_out", "equation": "b <=> ", "bounds": (-1000, 1000)},
+ ],
+ )
+ changed = constrain_reversible_reactions(m)
+ assert "R1" not in changed # can go both ways
+
+
+# --- group_linear_reactions -----------------------------------------------
+
+def test_linear_chain_merged():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b"}, # b: single producer
+ {"id": "R2", "equation": "b --> c"}, # b: single consumer
+ ],
+ )
+ n_before = len(m.reactions)
+ group_linear_reactions(m)
+ # b is eliminated; R1+R2 merged into one reaction a --> c
+ assert "b" not in m.metabolites
+ assert len(m.reactions) < n_before
+
+
+def test_group_linear_discards_genes():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a --> b", "gene_reaction_rule": "G1"},
+ {"id": "R2", "equation": "b --> c", "gene_reaction_rule": "G2"},
+ ],
+ )
+ group_linear_reactions(m)
+ assert len(m.genes) == 0
+
+
+# --- regression: incremental merge collapses a long chain (known_issues.md D1) ---
+
+def test_group_linear_merges_long_chain_in_one_pass():
+ """The incremental scan still flattens a 5-reaction linear chain — the
+ correctness property the original O(n²·m) restart-after-merge loop had."""
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in "abcdef"])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_in", "equation": " --> a"},
+ {"id": "R1", "equation": "a --> b"},
+ {"id": "R2", "equation": "b --> c"},
+ {"id": "R3", "equation": "c --> d"},
+ {"id": "R4", "equation": "d --> e"},
+ {"id": "R5", "equation": "e --> f"},
+ {"id": "R_out", "equation": "f --> "},
+ ],
+ )
+ group_linear_reactions(m)
+ # All the chain's internal metabolites are gone.
+ assert {x for x in m.metabolites if x.id in {"b", "c", "d", "e"}} == set()
+
+
+# --- regression: NaN FVA on infeasible model (known_issues.md C1) ----------
+
+def test_constrain_reversible_raises_on_infeasible():
+ """An infeasible model produces NaN FVA ranges; the old abs(NaN) < eps
+ check silently treated those as 'truly reversible'. Now raises."""
+ m = cobra.Model("t")
+ a, b = (cobra.Metabolite(x, compartment="c") for x in ("a", "b"))
+ m.add_metabolites([a, b])
+ # Force a contradiction: r requires production AND consumption of a, but
+ # nothing else produces a.
+ r = cobra.Reaction("r", lower_bound=-1, upper_bound=1)
+ r.add_metabolites({a: -1, b: 1})
+ forced = cobra.Reaction("forced", lower_bound=5, upper_bound=10) # infeasible
+ forced.add_metabolites({a: -1})
+ m.add_reactions([r, forced])
+ with pytest.raises(RuntimeError, match="infeasible"):
+ constrain_reversible_reactions(m)
diff --git a/tests/test_manipulation_transfer.py b/tests/test_manipulation_transfer.py
new file mode 100644
index 0000000..61c2ac9
--- /dev/null
+++ b/tests/test_manipulation_transfer.py
@@ -0,0 +1,137 @@
+"""Tests for add_reactions_from_model (addRxnsGenesMets port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, add_reactions_from_model
+
+
+@pytest.fixture
+def draft():
+ m = cobra.Model("draft")
+ m.add_metabolites(
+ [cobra.Metabolite("glc_c", name="Glucose", formula="C6H12O6", compartment="c")]
+ )
+ # an existing reaction so glc_c is in use and we have an id to test skipping
+ add_reactions_from_equations(m, [{"id": "R_existing", "equation": "glc_c <=>"}])
+ return m
+
+
+@pytest.fixture
+def source():
+ m = cobra.Model("source")
+ m.add_metabolites(
+ [
+ # same name[comp] as draft's glc_c but a DIFFERENT id
+ cobra.Metabolite("glucose_c", name="Glucose", formula="C6H12O6", compartment="c"),
+ cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+ cobra.Metabolite("g6p_c", name="G6P", formula="C6H13O9P", compartment="c"),
+ ]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {
+ "id": "HEX",
+ "equation": "glucose_c + atp_c --> g6p_c",
+ "name": "hexokinase",
+ "bounds": (0, 1000),
+ "gene_reaction_rule": "G1",
+ "subsystem": "glycolysis",
+ },
+ {"id": "R_existing", "equation": "glucose_c <=>"}, # id already in draft
+ ],
+ )
+ return m
+
+
+def test_metabolite_matched_by_name_comp_not_id(draft, source):
+ add_reactions_from_model(draft, source, "HEX")
+ hex_rxn = draft.reactions.get_by_id("HEX")
+ # Glucose reused from the draft (id glc_c), NOT the source's glucose_c
+ assert "glc_c" in {m.id for m in hex_rxn.metabolites}
+ assert "glucose_c" not in draft.metabolites
+
+
+def test_new_metabolites_added_with_metadata(draft, source):
+ add_reactions_from_model(draft, source, "HEX")
+ assert "atp_c" in draft.metabolites and "g6p_c" in draft.metabolites
+ assert draft.metabolites.get_by_id("g6p_c").formula == "C6H13O9P"
+ assert draft.metabolites.get_by_id("atp_c").charge == -4
+
+
+def test_reaction_copied_with_bounds_and_name(draft, source):
+ (rxn,) = add_reactions_from_model(draft, source, "HEX")
+ assert rxn.id == "HEX"
+ assert rxn.name == "hexokinase"
+ assert rxn.bounds == (0, 1000)
+ assert rxn.subsystem == "glycolysis"
+ assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+ "glc_c": -1.0,
+ "atp_c": -1.0,
+ "g6p_c": 1.0,
+ }
+
+
+def test_genes_true_copies_gpr_and_creates_genes(draft, source):
+ add_reactions_from_model(draft, source, "HEX", genes=True)
+ assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G1"
+ assert "G1" in draft.genes
+
+
+def test_genes_false_no_gpr(draft, source):
+ add_reactions_from_model(draft, source, "HEX", genes=False)
+ assert draft.reactions.get_by_id("HEX").gene_reaction_rule == ""
+
+
+def test_genes_string_override(draft, source):
+ add_reactions_from_model(draft, source, "HEX", genes="G9 or G10")
+ assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G9 or G10"
+
+
+def test_skips_already_present(draft, source):
+ added = add_reactions_from_model(draft, source, ["HEX", "R_existing"])
+ assert [r.id for r in added] == ["HEX"]
+
+
+def test_all_present_raises(draft, source):
+ with pytest.raises(ValueError, match="already in the model"):
+ add_reactions_from_model(draft, source, "R_existing")
+
+
+def test_unknown_source_reaction_raises(draft, source):
+ with pytest.raises(ValueError, match="not found in the source model"):
+ add_reactions_from_model(draft, source, "NOPE")
+
+
+def test_note_and_confidence_stored(draft, source):
+ (rxn,) = add_reactions_from_model(draft, source, "HEX", note="from KEGG", confidence=2)
+ assert rxn.notes["note"] == "from KEGG"
+ assert rxn.notes["confidence_score"] == 2
+
+
+# --- regression: intra-batch met-id minting collision (known_issues.md A3) ---
+
+def test_intra_batch_id_minting_unique():
+ """Two source mets whose ids both collide with the draft and whose name[comp]
+ differs both get routed through new-id minting. The fix tracks ids minted in
+ the current batch so the two don't collapse to the same generated id."""
+ draft = cobra.Model("draft")
+ draft.add_metabolites([
+ cobra.Metabolite("atp_c", name="ATP-draft", compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP-draft", compartment="c"),
+ ])
+ source = cobra.Model("source")
+ source.add_metabolites([
+ cobra.Metabolite("atp_c", name="ATP-source", compartment="c"),
+ cobra.Metabolite("adp_c", name="ADP-source", compartment="c"),
+ ])
+ rxn = cobra.Reaction("R1", lower_bound=0, upper_bound=1000)
+ source.add_reactions([rxn])
+ rxn.add_metabolites({
+ source.metabolites.get_by_id("atp_c"): -1,
+ source.metabolites.get_by_id("adp_c"): 1,
+ })
+ add_reactions_from_model(draft, source, "R1")
+ # Both source mets minted distinct ids (m1 and m2) — not a collision.
+ new_ids = sorted(m.id for m in draft.metabolites if m.id not in ("atp_c", "adp_c"))
+ assert len(new_ids) == 2 and len(set(new_ids)) == 2
diff --git a/tests/test_manipulation_transport.py b/tests/test_manipulation_transport.py
new file mode 100644
index 0000000..e8fb2b6
--- /dev/null
+++ b/tests/test_manipulation_transport.py
@@ -0,0 +1,98 @@
+"""Tests for add_transport_reactions (addTransport port)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_transport_reactions
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.compartments = {"c": "cytoplasm", "m": "mitochondrion", "e": "extracellular"}
+ m.add_metabolites(
+ [
+ cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"),
+ cobra.Metabolite("h2o_c", name="H2O", formula="H2O", compartment="c"),
+ cobra.Metabolite("atp_m", name="ATP", compartment="m"), # exists in m
+ ]
+ )
+ return m
+
+
+def test_basic_transport_to_existing(model):
+ added = add_transport_reactions(model, "c", "m", ["ATP"])
+ assert len(added) == 1
+ rxn = added[0]
+ assert rxn.id == "tr_0001"
+ assert rxn.name == "ATP transport, cytoplasm-mitochondrion"
+ assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == {
+ "atp_c": -1.0,
+ "atp_m": 1.0,
+ }
+ assert rxn.reversibility is True
+
+
+def test_only_to_existing_skips_missing(model):
+ # H2O is not in m; with only_to_existing (default) it's skipped
+ added = add_transport_reactions(model, "c", "m", ["ATP", "H2O"])
+ assert [r.id for r in added] == ["tr_0001"] # only ATP
+
+
+def test_creates_missing_target_metabolite(model):
+ added = add_transport_reactions(
+ model, "c", "m", ["H2O"], only_to_existing=False
+ )
+ assert len(added) == 1
+ new = [mt for mt in model.metabolites if mt.name == "H2O" and mt.compartment == "m"]
+ assert len(new) == 1
+ assert new[0].formula == "H2O" # copied from source
+
+
+def test_copies_formula_and_charge(model):
+ add_transport_reactions(model, "c", "e", ["ATP"], only_to_existing=False)
+ new = [mt for mt in model.metabolites if mt.name == "ATP" and mt.compartment == "e"][0]
+ assert new.formula == "C10H16N5O13P3"
+ assert new.charge == -4
+
+
+def test_irreversible(model):
+ (rxn,) = add_transport_reactions(model, "c", "m", ["ATP"], reversible=False)
+ assert rxn.lower_bound == 0
+ assert rxn.reversibility is False
+
+
+def test_default_all_metabolites_in_from(model):
+ # default metabolite_names = all in c (ATP, H2O); to m, only_to_existing -> only ATP
+ added = add_transport_reactions(model, "c", "m")
+ assert [r.id for r in added] == ["tr_0001"]
+
+
+def test_multiple_target_compartments_and_sequential_ids(model):
+ added = add_transport_reactions(
+ model, "c", ["m", "e"], ["ATP"], only_to_existing=False
+ )
+ assert [r.id for r in added] == ["tr_0001", "tr_0002"]
+
+
+def test_unknown_compartment_raises(model):
+ with pytest.raises(ValueError, match="not in the model"):
+ add_transport_reactions(model, "x", "m", ["ATP"])
+
+
+def test_unknown_metabolite_raises(model):
+ with pytest.raises(ValueError, match="not found in compartment"):
+ add_transport_reactions(model, "c", "m", ["NOPE"])
+
+
+# --- regression: duplicate name in compartment (known_issues.md A4) --------
+
+def test_duplicate_name_in_source_compartment_warns(model):
+ """Two source mets sharing a name in the same compartment warn instead of
+ silently collapsing — previously one was dropped from the lookup dict."""
+ model.add_metabolites([
+ cobra.Metabolite("h2o2_c", name="H2O", compartment="c"), # duplicate name
+ ])
+ with pytest.warns(UserWarning, match="Multiple metabolites named 'H2O'"):
+ added = add_transport_reactions(model, "c", "m", ["H2O"], only_to_existing=False)
+ # Transport still works (uses the first match) — the warning is the signal.
+ assert len(added) == 1
diff --git a/tests/test_parameters.py b/tests/test_parameters.py
new file mode 100644
index 0000000..c0ab06c
--- /dev/null
+++ b/tests/test_parameters.py
@@ -0,0 +1,60 @@
+"""Tests for set_variance_bounds (the var mode of setParam)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations, set_variance_bounds
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")]
+ )
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R1", "equation": "a_c <=> b_c"},
+ {"id": "R2", "equation": "a_c <=> b_c"},
+ ],
+ )
+ return m
+
+
+def test_band_positive(model):
+ set_variance_bounds(model, "R1", 100, 5) # 97.5 .. 102.5
+ lb, ub = model.reactions.get_by_id("R1").bounds
+ assert lb == pytest.approx(97.5)
+ assert ub == pytest.approx(102.5)
+
+
+def test_band_negative_is_ordered(model):
+ set_variance_bounds(model, "R1", -100, 5)
+ lb, ub = model.reactions.get_by_id("R1").bounds
+ assert lb == pytest.approx(-102.5)
+ assert ub == pytest.approx(-97.5)
+ assert lb <= ub
+
+
+def test_broadcast_scalar(model):
+ set_variance_bounds(model, ["R1", "R2"], 50, 10)
+ for rid in ("R1", "R2"):
+ lb, ub = model.reactions.get_by_id(rid).bounds
+ assert lb == pytest.approx(47.5)
+ assert ub == pytest.approx(52.5)
+
+
+def test_per_reaction_values(model):
+ set_variance_bounds(model, ["R1", "R2"], [100, 200], 0)
+ assert model.reactions.get_by_id("R1").bounds == pytest.approx((100, 100))
+ assert model.reactions.get_by_id("R2").bounds == pytest.approx((200, 200))
+
+
+def test_length_mismatch_raises(model):
+ with pytest.raises(ValueError, match="to match the reactions"):
+ set_variance_bounds(model, ["R1", "R2"], [1, 2, 3], 5)
+
+
+def test_unknown_reaction_raises(model):
+ with pytest.raises(ValueError, match="not found"):
+ set_variance_bounds(model, "NOPE", 1, 5)
diff --git a/tests/test_reconstruction_blast.py b/tests/test_reconstruction_blast.py
new file mode 100644
index 0000000..32af556
--- /dev/null
+++ b/tests/test_reconstruction_blast.py
@@ -0,0 +1,78 @@
+"""Tests for run_blast / run_diamond / blast_from_table + the tabular parser."""
+import shutil
+
+import pandas as pd
+import pytest
+
+from raven_python.reconstruction.homology import HIT_COLUMNS, blast_from_table, run_blast
+from raven_python.reconstruction.homology.blast import _parse_tabular
+
+_SEQ = (
+ "MSTNPKPQRKTKRNTNRRPQDVKFPGGGQIVGGVYLLPRRGPRLGVRATRKTSERSQPRGRRQPIPKARRPEGRTWAQPGYPWPLYGNEGCGWAGWLLSPRG"
+)
+
+
+def test_parse_tabular_csv():
+ text = "tg1,ng1,1e-50,99.0,120,250.0,99.5\ntg2,ng2,0.0,100.0,200,400.0,100.0\n"
+ df = _parse_tabular(text, "templ", "org", sep=",")
+ assert list(df.columns) == HIT_COLUMNS
+ assert df.iloc[0].from_gene == "tg1" and df.iloc[0].to_gene == "ng1"
+ assert df.iloc[0].from_id == "templ" and df.iloc[0].to_id == "org"
+ assert df.iloc[1].identity == 100.0 and df.iloc[1].align_len == 200
+
+
+def test_parse_tabular_empty():
+ assert _parse_tabular("", "a", "b", sep=",").empty
+
+
+def test_blast_from_table_dataframe_roundtrip():
+ df = pd.DataFrame(
+ [["templ", "org", "tg1", "ng1", 0.0, 100.0, 100, 200.0, 100.0]],
+ columns=HIT_COLUMNS + ["extra"][:0], # exactly HIT_COLUMNS
+ )
+ out = blast_from_table(df)
+ assert list(out.columns) == HIT_COLUMNS
+ assert len(out) == 1
+
+
+def test_blast_from_table_csv(tmp_path):
+ p = tmp_path / "hits.csv"
+ pd.DataFrame(
+ [["templ", "org", "tg1", "ng1", 0.0, 100.0, 100, 200.0, 100.0]], columns=HIT_COLUMNS
+ ).to_csv(p, index=False)
+ out = blast_from_table(p)
+ assert out.iloc[0].from_gene == "tg1"
+
+
+def test_blast_from_table_missing_columns():
+ with pytest.raises(ValueError, match="missing required columns"):
+ blast_from_table(pd.DataFrame({"from_id": ["x"]}))
+
+
+def test_blast_from_table_csv_numeric_gene_ids_stay_str(tmp_path):
+ """All-numeric gene ids (e.g. Entrez) read as str, so they match model gene ids."""
+ p = tmp_path / "hits.csv"
+ pd.DataFrame(
+ [["templ", "org", 125, 4790, 0.0, 100.0, 100, 200.0, 100.0]], columns=HIT_COLUMNS
+ ).to_csv(p, index=False)
+ out = blast_from_table(p)
+ assert out.iloc[0].from_gene == "125" and out.iloc[0].to_gene == "4790"
+
+
+@pytest.mark.skipif(
+ not (shutil.which("blastp") and shutil.which("makeblastdb")), reason="BLAST+ not installed"
+)
+def test_run_blast_integration(tmp_path):
+ org = tmp_path / "org.faa"
+ ref = tmp_path / "templ.faa"
+ org.write_text(f">ngene\n{_SEQ}\n")
+ ref.write_text(f">tgene\n{_SEQ}\n") # identical sequence -> strong reciprocal hit
+
+ hits = run_blast("org", org, ["templ"], [ref])
+ assert list(hits.columns) == HIT_COLUMNS
+ assert not hits.empty
+ # both directions present
+ assert {("templ", "org"), ("org", "templ")} <= set(zip(hits.from_id, hits.to_id, strict=False))
+ # the reciprocal pair tgene<->ngene is found
+ fwd = hits[(hits.from_gene == "tgene") & (hits.to_gene == "ngene")]
+ assert not fwd.empty
diff --git a/tests/test_reconstruction_homology.py b/tests/test_reconstruction_homology.py
new file mode 100644
index 0000000..63ed72f
--- /dev/null
+++ b/tests/test_reconstruction_homology.py
@@ -0,0 +1,138 @@
+"""Tests for homology reconstruction core (make_ortholog_hits + get_model_from_homology)."""
+import cobra
+import pandas as pd
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.reconstruction.homology import (
+ HIT_COLUMNS,
+ get_model_from_homology,
+ make_ortholog_hits,
+)
+
+# --- make_ortholog_hits ----------------------------------------------------
+
+def test_make_ortholog_hits_bidirectional():
+ hits = make_ortholog_hits([("tA", "nA"), ("tB", "nB")], "template", "neworg")
+ assert list(hits.columns) == HIT_COLUMNS
+ assert len(hits) == 4 # 2 pairs x 2 directions
+ fwd = hits[(hits.from_id == "template") & (hits.from_gene == "tA")]
+ assert fwd.iloc[0].to_gene == "nA"
+ rev = hits[(hits.from_id == "neworg") & (hits.from_gene == "nA")]
+ assert rev.iloc[0].to_gene == "tA"
+
+
+def test_make_ortholog_hits_empty_raises():
+ with pytest.raises(ValueError, match="empty"):
+ make_ortholog_hits([], "t", "n")
+
+
+# --- template model fixture ------------------------------------------------
+
+def _template():
+ m = cobra.Model("templateGEM")
+ m.compartments = {"c": "cytoplasm"}
+ m.add_metabolites([cobra.Metabolite(x, name=x.upper(), compartment="c") for x in ("a", "b", "d")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R_single", "equation": "a --> b", "gene_reaction_rule": "tg1"},
+ {"id": "R_iso", "equation": "b --> d", "gene_reaction_rule": "tg2 or tg3"},
+ {"id": "R_cplx", "equation": "a --> d", "gene_reaction_rule": "tg4 and tg5"},
+ ],
+ )
+ return m
+
+
+# --- one-to-one transfer ---------------------------------------------------
+
+def test_single_gene_reaction_transferred():
+ t = _template()
+ hits = make_ortholog_hits([("tg1", "ng1")], "templateGEM", "bug")
+ res = get_model_from_homology([t], hits, "bug")
+ assert res.model.id == "bug"
+ assert "R_single" in {r.id for r in res.model.reactions}
+ assert res.model.reactions.get_by_id("R_single").gene_reaction_rule == "ng1"
+
+
+def test_unsupported_reaction_dropped():
+ t = _template()
+ hits = make_ortholog_hits([("tg1", "ng1")], "templateGEM", "bug") # only tg1 mapped
+ res = get_model_from_homology([t], hits, "bug")
+ # R_iso (tg2/tg3) and R_cplx (tg4/tg5) have no ortholog -> dropped
+ assert {r.id for r in res.model.reactions} == {"R_single"}
+
+
+def test_one_to_many_orthologs_become_or():
+ t = _template()
+ hits = make_ortholog_hits([("tg1", "ngA"), ("tg1", "ngB")], "templateGEM", "bug")
+ res = get_model_from_homology([t], hits, "bug")
+ assert res.model.reactions.get_by_id("R_single").gene_reaction_rule == "ngA or ngB"
+
+
+# --- isozyme (OR) handling -------------------------------------------------
+
+def test_isozyme_branch_without_ortholog_dropped():
+ t = _template()
+ hits = make_ortholog_hits([("tg2", "ng2")], "templateGEM", "bug") # only one isozyme maps
+ res = get_model_from_homology([t], hits, "bug")
+ assert res.model.reactions.get_by_id("R_iso").gene_reaction_rule == "ng2"
+
+
+# --- complex (AND) policies ------------------------------------------------
+
+def _complex_hits():
+ # only tg4 of the tg4-and-tg5 complex has an ortholog
+ return make_ortholog_hits([("tg4", "ng4")], "templateGEM", "bug")
+
+
+def test_complex_policy_flag_keeps_old_marker():
+ res = get_model_from_homology([_template()], _complex_hits(), "bug", complex_policy="flag")
+ gpr = res.model.reactions.get_by_id("R_cplx").gene_reaction_rule
+ assert "ng4" in gpr and "OLD_templateGEM_tg5" in gpr and " and " in gpr
+
+
+def test_complex_policy_keep_drops_unmapped_subunit():
+ res = get_model_from_homology([_template()], _complex_hits(), "bug", complex_policy="keep")
+ assert res.model.reactions.get_by_id("R_cplx").gene_reaction_rule == "ng4"
+
+
+def test_complex_policy_drop_removes_reaction():
+ res = get_model_from_homology([_template()], _complex_hits(), "bug", complex_policy="drop")
+ assert "R_cplx" not in {r.id for r in res.model.reactions}
+
+
+# --- strictness alias + bidirectional --------------------------------------
+
+def test_strictness_alias_maps_params():
+ t = _template()
+ hits = make_ortholog_hits([("tg1", "ng1")], "templateGEM", "bug")
+ res = get_model_from_homology([t], hits, "bug", strictness=3) # bidir + best-hits
+ assert "R_single" in {r.id for r in res.model.reactions}
+
+
+def test_one_directional_non_reciprocal():
+ # build hits with only the new->old direction present
+ hits = make_ortholog_hits([("tg1", "ng1")], "templateGEM", "bug")
+ one_way = hits[hits.from_id == "bug"] # drop the template->new rows
+ t = _template()
+ # bidirectional default would find nothing; one-directional should map
+ assert "R_single" not in {r.id for r in get_model_from_homology([t], one_way, "bug").model.reactions}
+ res = get_model_from_homology([t], one_way, "bug", bidirectional=False, map_direction="new_to_old")
+ assert "R_single" in {r.id for r in res.model.reactions}
+
+
+# --- preferred order -------------------------------------------------------
+
+def test_preferred_order_routes_gene_to_one_model():
+ t1 = _template()
+ t1.id = "modelA"
+ t2 = _template()
+ t2.id = "modelB"
+ hits1 = make_ortholog_hits([("tg1", "ng1")], "modelA", "bug")
+ hits2 = make_ortholog_hits([("tg1", "ng1")], "modelB", "bug")
+ hits = pd.concat([hits1, hits2], ignore_index=True)
+ res = get_model_from_homology([t1, t2], hits, "bug", preferred_order=["modelA", "modelB"])
+ # ng1's reaction comes only from modelA
+ sources = {r.notes.get("homology_source") for r in res.model.reactions if r.id.startswith("R_single")}
+ assert sources == {"modelA"}
diff --git a/tests/test_reconstruction_kegg_download.py b/tests/test_reconstruction_kegg_download.py
new file mode 100644
index 0000000..38d2f44
--- /dev/null
+++ b/tests/test_reconstruction_kegg_download.py
@@ -0,0 +1,125 @@
+"""Tests for the KEGG download/arrange tooling (reconstruction/kegg/download.py).
+
+The network fetch needs a paid KEGG subscription, so it is not exercised here.
+We test credential resolution and the network-free extract/arrange core against
+hand-built fake archives.
+"""
+import gzip
+import io
+import tarfile
+from pathlib import Path
+
+import pytest
+
+from raven_python.reconstruction.kegg.download import (
+ _resolve_auth,
+ extract_kegg_dump,
+)
+
+
+def _make_targz(path: Path, members: dict[str, bytes]) -> None:
+ with tarfile.open(path, "w:gz") as tar:
+ for name, data in members.items():
+ info = tarfile.TarInfo(name)
+ info.size = len(data)
+ tar.addfile(info, io.BytesIO(data))
+
+
+def _make_gz(path: Path, data: bytes) -> None:
+ with gzip.open(path, "wb") as fh:
+ fh.write(data)
+
+
+# --------------------------------------------------------------------------- #
+# Credentials
+# --------------------------------------------------------------------------- #
+def test_resolve_auth_explicit_wins():
+ assert _resolve_auth("ftp.kegg.net", auth=("u", "p")) == ("u", "p")
+
+
+def test_resolve_auth_from_netrc(tmp_path):
+ netrc_file = tmp_path / ".netrc"
+ netrc_file.write_text("machine ftp.kegg.net login alice password s3cret\n")
+ netrc_file.chmod(0o600)
+ assert _resolve_auth("ftp.kegg.net", netrc_path=netrc_file) == ("alice", "s3cret")
+
+
+def test_resolve_auth_missing_file(tmp_path):
+ with pytest.raises(FileNotFoundError, match="does not exist"):
+ _resolve_auth("ftp.kegg.net", netrc_path=tmp_path / "nope")
+
+
+def test_resolve_auth_host_absent(tmp_path):
+ netrc_file = tmp_path / ".netrc"
+ netrc_file.write_text("machine other.host login a password b\n")
+ netrc_file.chmod(0o600)
+ with pytest.raises(ValueError, match="No credentials for"):
+ _resolve_auth("ftp.kegg.net", netrc_path=netrc_file)
+
+
+# --------------------------------------------------------------------------- #
+# Extract / arrange
+# --------------------------------------------------------------------------- #
+@pytest.fixture
+def fake_dump(tmp_path):
+ """A tmp dir populated with fake KEGG archives, as fetch would leave them."""
+ _make_targz(
+ tmp_path / "reaction.tar.gz",
+ {
+ "reaction/reaction": b"RXN_ENTRIES\n",
+ "reaction/reaction.lst": b"R00010: A <=> B\n",
+ "reaction/reaction_mapformula.lst": b"R00010: 00010: A => B\n",
+ "reaction/reaction.name": b"discard me\n", # extra file, not lifted
+ },
+ )
+ _make_targz(
+ tmp_path / "compound.tar.gz",
+ {"compound/compound": b"CPD\n", "compound/compound.inchi": b"C00031\tInChI=x\n"},
+ )
+ _make_targz(tmp_path / "glycan.tar.gz", {"glycan/glycan": b"GLY\n"})
+ _make_targz(tmp_path / "ko.tar.gz", {"ko/ko": b"KO\n"})
+ _make_gz(tmp_path / "eukaryotes.pep.gz", b">euk\nMKV\n")
+ _make_gz(tmp_path / "prokaryotes.pep.gz", b">prok\nMAA\n")
+ (tmp_path / "taxonomy").write_text("tax\n")
+ return tmp_path
+
+
+def test_extract_produces_flat_layout(fake_dump):
+ result = extract_kegg_dump(fake_dump)
+ expected = {
+ "reaction",
+ "reaction.lst",
+ "reaction_mapformula.lst",
+ "compound",
+ "compound.inchi",
+ "ko",
+ "genes.pep",
+ "taxonomy",
+ }
+ assert set(result) == expected
+ assert all(p.is_file() for p in result.values())
+
+
+def test_extract_concatenates_compound_and_glycan(fake_dump):
+ extract_kegg_dump(fake_dump)
+ assert (fake_dump / "compound").read_bytes() == b"CPD\nGLY\n"
+
+
+def test_extract_concatenates_proteomes(fake_dump):
+ extract_kegg_dump(fake_dump)
+ assert (fake_dump / "genes.pep").read_bytes() == b">euk\nMKV\n>prok\nMAA\n"
+
+
+def test_extract_removes_subdirs_and_archives(fake_dump):
+ extract_kegg_dump(fake_dump)
+ assert not list(fake_dump.glob("*.tar.gz"))
+ assert not list(fake_dump.glob("*.gz"))
+ for subdir in ("reaction", "compound", "glycan", "ko"):
+ assert not (fake_dump / subdir).is_dir()
+ assert not (fake_dump / "reaction.name").exists() # extra file discarded
+
+
+def test_extract_requires_core_archives(tmp_path):
+ _make_targz(tmp_path / "compound.tar.gz", {"compound/compound": b"CPD\n"})
+ with pytest.raises(FileNotFoundError, match="required file"):
+ extract_kegg_dump(tmp_path)
diff --git a/tests/test_reconstruction_kegg_hmm.py b/tests/test_reconstruction_kegg_hmm.py
new file mode 100644
index 0000000..3f559ec
--- /dev/null
+++ b/tests/test_reconstruction_kegg_hmm.py
@@ -0,0 +1,326 @@
+"""Tests for KEGG HMM-library construction (taxonomy + hmm, step 3b.3)."""
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from raven_python.reconstruction.kegg import (
+ build_ko_fastas,
+ organism_domains,
+ organisms_in_domain,
+ parse_taxonomy,
+)
+from raven_python.reconstruction.kegg import hmm as hmm_mod
+from raven_python.reconstruction.kegg.hmm import (
+ _cdhit_cmd,
+ _cdhit_word_size,
+ _fasta_stats,
+ _hmmbuild_cmd,
+ _mafft_cmd,
+ build_ko_hmm,
+)
+
+DUMP = Path(__file__).parent / "data" / "kegg_dump"
+
+
+@pytest.fixture
+def organism_gene_ko():
+ return pd.DataFrame(
+ [
+ ("bsu", "BSU31050", "K01194"),
+ ("bsu", "BSU31060", "K01194"),
+ ("hsa", "124", "K01194"),
+ ("hsa", "125", "K01194"),
+ ("eco", "b0001", "K00002"),
+ ],
+ columns=["organism", "gene", "ko"],
+ )
+
+
+# --------------------------------------------------------------------------- #
+# Taxonomy
+# --------------------------------------------------------------------------- #
+def test_parse_taxonomy_lineages():
+ cats = parse_taxonomy(DUMP / "taxonomy")
+ assert cats["bsu"] == ["Prokaryotes", "Bacteria", "Firmicutes"]
+ assert cats["hsa"][0] == "Eukaryotes"
+ assert cats["eco"][1] == "Bacteria"
+
+
+def test_organism_domains():
+ assert organism_domains(DUMP / "taxonomy") == {
+ "bsu": "Prokaryotes",
+ "eco": "Prokaryotes",
+ "hsa": "Eukaryotes",
+ }
+
+
+def test_organisms_in_domain_prefix_match():
+ assert organisms_in_domain(DUMP / "taxonomy", "prok") == {"bsu", "eco"}
+ assert organisms_in_domain(DUMP / "taxonomy", "Eukaryotes") == {"hsa"}
+
+
+def test_parse_taxonomy_handles_skipped_depth(tmp_path):
+ """A ``##`` directly under a ``#`` (skipping ``##`` level) used to corrupt
+ the stack. Now pads with '' placeholders and warns once (known_issues.md C4)."""
+ p = tmp_path / "tax"
+ p.write_text(
+ "#Domain1\n"
+ "###Skipped\n" # skips ##
+ "T9999\torg1\tan org\n"
+ )
+ with pytest.warns(UserWarning, match="depth skips a level"):
+ cats = parse_taxonomy(p)
+ # Domain still recoverable; the missing level is a placeholder.
+ assert cats["org1"][0] == "Domain1"
+ assert cats["org1"][-1] == "Skipped"
+
+
+# --------------------------------------------------------------------------- #
+# build_ko_fastas (constructMultiFasta)
+# --------------------------------------------------------------------------- #
+def test_build_ko_fastas_groups_by_ko(organism_gene_ko, tmp_path):
+ written = build_ko_fastas(organism_gene_ko, DUMP / "genes.pep", tmp_path)
+ assert set(written) == {"K01194", "K00002"}
+ k01194 = (tmp_path / "K01194.fa").read_text()
+ assert k01194.count(">") == 4 # bsu x2 + hsa x2
+ assert ">bsu:BSU31050" in k01194
+ assert ">xxx:unused" not in k01194 # gene not in any KO is excluded
+
+
+def test_build_ko_fastas_domain_filter(organism_gene_ko, tmp_path):
+ prok = organisms_in_domain(DUMP / "taxonomy", "prokaryotes")
+ written = build_ko_fastas(organism_gene_ko, DUMP / "genes.pep", tmp_path, organisms=prok)
+ # Only prokaryote genes: K01194 keeps bsu (2), K00002 keeps eco (1).
+ assert (tmp_path / "K01194.fa").read_text().count(">") == 2
+ assert ">hsa:" not in (tmp_path / "K01194.fa").read_text()
+ assert set(written) == {"K01194", "K00002"}
+
+
+def test_build_ko_fastas_sequences_intact(organism_gene_ko, tmp_path):
+ build_ko_fastas(organism_gene_ko, DUMP / "genes.pep", tmp_path)
+ text = (tmp_path / "K00002.fa").read_text()
+ assert text.startswith(">eco:b0001")
+ assert "MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDA" in text
+
+
+# --------------------------------------------------------------------------- #
+# Command builders / CD-HIT word size (pure)
+# --------------------------------------------------------------------------- #
+@pytest.mark.parametrize(
+ "identity, expected",
+ [(0.9, "5"), (0.7, "4"), (0.65, "4"), (0.55, "3"), (0.45, "2")],
+)
+def test_cdhit_word_size(identity, expected):
+ assert _cdhit_word_size(identity) == expected
+
+
+def test_cdhit_word_size_out_of_range():
+ with pytest.raises(ValueError, match="seq_identity"):
+ _cdhit_word_size(0.3)
+
+
+def test_command_builders():
+ cd = _cdhit_cmd("cd-hit", Path("in.fa"), Path("out.fa"), 0.9, 4)
+ assert cd[:3] == ["cd-hit", "-i", "in.fa"]
+ assert "-c" in cd and "0.9" in cd and "-n" in cd and "5" in cd
+ # Default is fast progressive (FFT-NS-2), not --auto.
+ assert _mafft_cmd("mafft", Path("in.fa"), 2) == [
+ "mafft", "--retree", "2", "--maxiterate", "0", "--anysymbol", "--thread", "2", "in.fa"
+ ]
+ assert _mafft_cmd("mafft", Path("in.fa"), 2, fast=False)[:2] == ["mafft", "--auto"]
+ assert "--parttree" in _mafft_cmd("mafft", Path("in.fa"), 2, parttree=True)
+ assert _hmmbuild_cmd("hmmbuild", Path("o.hmm"), Path("a.fa"), 3) == [
+ "hmmbuild", "--cpu", "3", "o.hmm", "a.fa"
+ ]
+
+
+# --------------------------------------------------------------------------- #
+# build_ko_hmm orchestration (binaries mocked)
+# --------------------------------------------------------------------------- #
+def test_build_ko_hmm_multi_sequence_runs_full_pipeline(tmp_path, monkeypatch):
+ fasta = tmp_path / "K01194.fa"
+ fasta.write_text(">a\nMKV\n>b\nMRV\n")
+ calls = []
+
+ monkeypatch.setattr(
+ "raven_python.reconstruction.kegg.hmm.resolve_binary",
+ lambda exe, binary=None: binary or exe,
+ )
+
+ def fake_run(cmd, *, stdout_path=None):
+ calls.append(Path(cmd[0]).name)
+ # Emulate each tool producing its expected output file.
+ if stdout_path is not None:
+ Path(stdout_path).write_text(">a\nMKV\n>b\nMRV\n")
+ if Path(cmd[0]).name == "cd-hit":
+ Path(cmd[cmd.index("-o") + 1]).write_text(">a\nMKV\n>b\nMRV\n")
+ if Path(cmd[0]).name == "hmmbuild":
+ Path(cmd[-2]).write_text("HMM\n")
+ return ""
+
+ monkeypatch.setattr("raven_python.reconstruction.kegg.hmm._run", fake_run)
+ out = build_ko_hmm(fasta, tmp_path / "K01194.hmm")
+ assert calls == ["cd-hit", "mafft", "hmmbuild"]
+ assert out.read_text() == "HMM\n"
+
+
+def test_build_ko_hmm_single_sequence_skips_align(tmp_path, monkeypatch):
+ fasta = tmp_path / "K9.fa"
+ fasta.write_text(">only\nMKV\n")
+ calls = []
+ monkeypatch.setattr(
+ "raven_python.reconstruction.kegg.hmm.resolve_binary",
+ lambda exe, binary=None: binary or exe,
+ )
+
+ def fake_run(cmd, *, stdout_path=None):
+ calls.append(Path(cmd[0]).name)
+ if Path(cmd[0]).name == "hmmbuild":
+ Path(cmd[-2]).write_text("HMM\n")
+ return ""
+
+ monkeypatch.setattr("raven_python.reconstruction.kegg.hmm._run", fake_run)
+ build_ko_hmm(fasta, tmp_path / "K9.hmm")
+ assert calls == ["hmmbuild"] # no cd-hit / mafft for a lone sequence
+
+
+def test_build_ko_hmm_verbose_logs_each_stage(tmp_path, monkeypatch, caplog):
+ fasta = tmp_path / "K01194.fa"
+ fasta.write_text(">a\nMKV\n>b\nMRV\n")
+ monkeypatch.setattr(
+ "raven_python.reconstruction.kegg.hmm.resolve_binary", lambda exe, binary=None: binary or exe
+ )
+
+ def fake_run(cmd, *, stdout_path=None):
+ if stdout_path is not None:
+ Path(stdout_path).write_text(">a\nMKV\n>b\nMRV\n")
+ if Path(cmd[0]).name == "cd-hit":
+ Path(cmd[cmd.index("-o") + 1]).write_text(">a\nMKV\n>b\nMRV\n")
+ if Path(cmd[0]).name == "hmmbuild":
+ Path(cmd[-2]).write_text("HMM\n")
+ return ""
+
+ monkeypatch.setattr("raven_python.reconstruction.kegg.hmm._run", fake_run)
+ with caplog.at_level("INFO", logger="raven_python.reconstruction.kegg.hmm"):
+ build_ko_hmm(fasta, tmp_path / "K01194.hmm", verbose=True)
+ text = caplog.text
+ # Each stage is logged, labelled with the KO id.
+ assert "[K01194] start: 2 sequences" in text
+ assert "[K01194] CD-HIT" in text
+ assert "[K01194] MAFFT" in text
+ assert "[K01194] hmmbuild: done in" in text
+ # Each stage is a single line: the tool/params and the timing together, not split.
+ assert "running" not in text
+ assert "[K01194] complete" in text
+
+
+def test_build_ko_hmm_quiet_by_default(tmp_path, monkeypatch, caplog):
+ fasta = tmp_path / "K9.fa"
+ fasta.write_text(">only\nMKV\n")
+ monkeypatch.setattr(
+ "raven_python.reconstruction.kegg.hmm.resolve_binary", lambda exe, binary=None: binary or exe
+ )
+ monkeypatch.setattr(
+ "raven_python.reconstruction.kegg.hmm._run",
+ lambda cmd, *, stdout_path=None: Path(cmd[-2]).write_text("HMM\n") and "",
+ )
+ with caplog.at_level("INFO", logger="raven_python.reconstruction.kegg.hmm"):
+ build_ko_hmm(fasta, tmp_path / "K9.hmm") # verbose defaults False
+ assert caplog.text == ""
+
+
+def test_fasta_stats_counts_residues(tmp_path):
+ fa = tmp_path / "x.fa"
+ fa.write_text(">a\nMKVL\nAAG\n>b\nMR\n") # a=7 residues (2 lines), b=2
+ assert _fasta_stats(fa) == (2, 9)
+
+
+def test_auto_cost_budget_scales_with_memory(monkeypatch):
+ hmm_mod._auto_cost_budget.cache_clear()
+ monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: 64 * 1024**3)
+ big = hmm_mod._auto_cost_budget()
+ hmm_mod._auto_cost_budget.cache_clear()
+ monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: 8 * 1024**3)
+ small = hmm_mod._auto_cost_budget()
+ assert big > small > 0 # more RAM -> larger DP-cost budget
+ hmm_mod._auto_cost_budget.cache_clear()
+
+
+def test_auto_cost_budget_warns_on_low_memory(monkeypatch, caplog):
+ hmm_mod._auto_cost_budget.cache_clear()
+ monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: 7 * 1024**3)
+ with caplog.at_level("WARNING", logger="raven_python.reconstruction.kegg.hmm"):
+ hmm_mod._auto_cost_budget()
+ assert "Limited memory" in caplog.text
+ hmm_mod._auto_cost_budget.cache_clear()
+
+
+def test_auto_cost_budget_falls_back_without_detection(monkeypatch, caplog):
+ hmm_mod._auto_cost_budget.cache_clear()
+ monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: None)
+ with caplog.at_level("WARNING", logger="raven_python.reconstruction.kegg.hmm"):
+ assert hmm_mod._auto_cost_budget() == hmm_mod._DEFAULT_COST_BUDGET
+ assert "Could not detect system memory" in caplog.text
+ hmm_mod._auto_cost_budget.cache_clear()
+
+
+def test_long_proteins_route_to_parttree(monkeypatch, tmp_path):
+ # Few but very long sequences (K12047-like): low residue count, high DP cost,
+ # so the length-aware budget must pick PartTree (a residue-only rule would not).
+ fasta = tmp_path / "K12047.fa"
+ fasta.write_text("".join(f">g{i}\n{'M' * 2000}\n" for i in range(300))) # 300 x 2000 aa
+ monkeypatch.setattr(hmm_mod, "resolve_binary", lambda exe, binary=None: binary or exe)
+ hmm_mod._auto_cost_budget.cache_clear()
+ monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: 8 * 1024**3)
+ seen = {}
+
+ def fake_run(cmd, *, stdout_path=None):
+ name = Path(cmd[0]).name
+ if name == "cd-hit":
+ Path(cmd[cmd.index("-o") + 1]).write_text(fasta.read_text())
+ if name == "mafft":
+ seen["parttree"] = "--parttree" in cmd
+ Path(stdout_path).write_text(fasta.read_text())
+ if name == "hmmbuild":
+ Path(cmd[-2]).write_text("HMM\n")
+ return ""
+
+ monkeypatch.setattr(hmm_mod, "_run", fake_run)
+ build_ko_hmm(fasta, tmp_path / "K12047.hmm")
+ hmm_mod._auto_cost_budget.cache_clear()
+ # 300x2000 = 600k residues (a residue rule with a ~1M cutoff would NOT trigger),
+ # but DP cost 1.2e9 exceeds the 8 GB budget -> PartTree.
+ assert seen["parttree"] is True
+
+
+def test_parttree_residues_param_overrides_auto(tmp_path, monkeypatch):
+ # The explicit parttree_residues argument decides the MAFFT method (residues only).
+ fasta = tmp_path / "K.fa"
+ fasta.write_text("".join(f">g{i}\n{'M' * 1000}\n" for i in range(5))) # 5000 residues
+ monkeypatch.setattr(hmm_mod, "resolve_binary", lambda exe, binary=None: binary or exe)
+ seen = {}
+
+ def fake_run(cmd, *, stdout_path=None):
+ name = Path(cmd[0]).name
+ if name == "cd-hit":
+ Path(cmd[cmd.index("-o") + 1]).write_text(fasta.read_text())
+ if name == "mafft":
+ seen["parttree"] = "--parttree" in cmd
+ Path(stdout_path).write_text(fasta.read_text())
+ if name == "hmmbuild":
+ Path(cmd[-2]).write_text("HMM\n")
+ return ""
+
+ monkeypatch.setattr(hmm_mod, "_run", fake_run)
+ build_ko_hmm(fasta, tmp_path / "a.hmm", parttree_residues=10_000) # 5000 < 10000
+ assert seen["parttree"] is False # stays on FFT-NS-2
+ build_ko_hmm(fasta, tmp_path / "b.hmm", parttree_residues=4000) # 5000 > 4000
+ assert seen["parttree"] is True # switches to PartTree
+
+
+def test_build_ko_hmm_empty_fasta_raises(tmp_path):
+ fasta = tmp_path / "empty.fa"
+ fasta.write_text("")
+ with pytest.raises(ValueError, match="no sequences"):
+ build_ko_hmm(fasta, tmp_path / "empty.hmm")
diff --git a/tests/test_reconstruction_kegg_organism.py b/tests/test_reconstruction_kegg_organism.py
new file mode 100644
index 0000000..f64f15b
--- /dev/null
+++ b/tests/test_reconstruction_kegg_organism.py
@@ -0,0 +1,179 @@
+"""Tests for get_kegg_model_for_organism (KEGG organism-ID mode, step 3b.4)."""
+from pathlib import Path
+
+import cobra
+import pandas as pd
+import pytest
+
+from raven_python.reconstruction.kegg import (
+ build_kegg_tables,
+ build_reference_model,
+ get_kegg_model_for_organism,
+ get_kegg_model_for_organism_from_artefacts,
+ parse_kegg_compounds,
+ parse_kegg_dump,
+ parse_kegg_reactions,
+)
+
+DUMP = Path(__file__).parent / "data" / "kegg_dump"
+
+
+@pytest.fixture(scope="module")
+def artefacts():
+ reactions = parse_kegg_reactions(DUMP)
+ compounds = parse_kegg_compounds(DUMP)
+ linked = {ko for r in reactions for ko in r.kos}
+ from raven_python.reconstruction.kegg import parse_kegg_kos
+
+ kos = parse_kegg_kos(DUMP, keep=linked)
+ model = build_reference_model(reactions, compounds)
+ tables = build_kegg_tables(reactions, kos)
+ return model, tables
+
+
+def _build(artefacts, organism_id, **kw):
+ model, tables = artefacts
+ return get_kegg_model_for_organism(
+ organism_id,
+ model,
+ tables["ko_reaction"],
+ tables["organism_gene_ko"],
+ rxn_flags=tables["rxn_flags"],
+ **kw,
+ )
+
+
+# --------------------------------------------------------------------------- #
+# Core behaviour
+# --------------------------------------------------------------------------- #
+def test_eco_keeps_only_its_reactions(artefacts):
+ # eco has b0001 -> K00002 -> R00100 only.
+ model = _build(artefacts, "eco")
+ assert {r.id for r in model.reactions} == {"R00100"}
+ assert model.id == "eco"
+
+
+def test_eco_gpr_and_gene_annotation(artefacts):
+ model = _build(artefacts, "eco")
+ r = model.reactions.get_by_id("R00100")
+ assert r.gene_reaction_rule == "b0001"
+ assert model.genes.get_by_id("b0001").annotation["kegg.genes"] == "eco:b0001"
+ assert r.notes["note"].startswith("Included by get_kegg_model_for_organism")
+
+
+def test_bsu_or_joins_multiple_genes(artefacts):
+ # bsu has BSU31050 + BSU31060, both -> K01194 -> R00010.
+ model = _build(artefacts, "bsu")
+ r = model.reactions.get_by_id("R00010")
+ assert set(r.genes) == {model.genes.get_by_id("BSU31050"), model.genes.get_by_id("BSU31060")}
+ assert r.gene_reaction_rule == "BSU31050 or BSU31060"
+
+
+def test_case_insensitive_organism(artefacts):
+ assert "R00010" in _build(artefacts, "BSU").reactions
+
+
+def test_orphan_metabolites_pruned(artefacts):
+ # eco keeps only R00100 (C00002, C00003); trehalose/glucose mets should go.
+ model = _build(artefacts, "eco")
+ assert {m.id for m in model.metabolites} == {"C00002", "C00003"}
+
+
+def test_reference_model_unmodified(artefacts):
+ reference, _ = artefacts
+ before = len(reference.reactions)
+ _build(artefacts, "eco")
+ assert len(reference.reactions) == before # worked on a copy
+ assert len(reference.genes) == 0
+
+
+# --------------------------------------------------------------------------- #
+# Spontaneous handling
+# --------------------------------------------------------------------------- #
+def test_spontaneous_reaction_kept_without_genes(artefacts):
+ # R00100 is spontaneous; for bsu it has no genes but is kept (no GPR).
+ model = _build(artefacts, "bsu", keep_spontaneous=True)
+ assert "R00100" in model.reactions
+ assert model.reactions.get_by_id("R00100").gene_reaction_rule == ""
+
+
+def test_spontaneous_dropped_when_disabled(artefacts):
+ model = _build(artefacts, "bsu", keep_spontaneous=False)
+ assert "R00100" not in model.reactions
+ assert "R00010" in model.reactions # the gene-backed reaction stays
+
+
+# --------------------------------------------------------------------------- #
+# Quality filters take precedence over having genes
+# --------------------------------------------------------------------------- #
+def _tiny_general_case():
+ ref = cobra.Model("KEGG")
+ a = cobra.Metabolite("C1", compartment="s")
+ b = cobra.Metabolite("C2", compartment="s")
+ ref.add_metabolites([a, b])
+ rxn = cobra.Reaction("R1")
+ ref.add_reactions([rxn])
+ rxn.add_metabolites({a: -1, b: 1})
+ ko_reaction = pd.DataFrame([("K1", "R1")], columns=["ko", "reaction"])
+ ogk = pd.DataFrame([("xyz", "g1", "K1")], columns=["organism", "gene", "ko"])
+ flags = pd.DataFrame(
+ [("R1", False, False, False, True)],
+ columns=["reaction", "spontaneous", "undefined_stoich", "incomplete", "general"],
+ )
+ return ref, ko_reaction, ogk, flags
+
+
+def test_general_filter_drops_reaction_with_genes():
+ ref, ko_reaction, ogk, flags = _tiny_general_case()
+ model = get_kegg_model_for_organism("xyz", ref, ko_reaction, ogk, rxn_flags=flags)
+ assert "R1" not in model.reactions # general + keep_general=False (default)
+
+
+def test_general_kept_when_enabled():
+ ref, ko_reaction, ogk, flags = _tiny_general_case()
+ model = get_kegg_model_for_organism(
+ "xyz", ref, ko_reaction, ogk, rxn_flags=flags, keep_general=True
+ )
+ assert model.reactions.get_by_id("R1").gene_reaction_rule == "g1"
+
+
+# --------------------------------------------------------------------------- #
+# Validation + artefact loading
+# --------------------------------------------------------------------------- #
+def test_unknown_organism_raises(artefacts):
+ with pytest.raises(ValueError, match="no genes"):
+ _build(artefacts, "zzz")
+
+
+def test_domain_mode_needs_taxonomy(artefacts):
+ with pytest.raises(ValueError, match="taxonomy"):
+ _build(artefacts, "eukaryotes")
+
+
+def test_domain_mode_keeps_all_domain_organisms(artefacts):
+ # Prokaryotes (bsu + eco) -> R00010 (bsu genes) and R00100 (eco gene).
+ model = _build(artefacts, "prokaryotes", taxonomy=DUMP / "taxonomy")
+ assert "R00010" in model.reactions
+ assert "R00100" in model.reactions
+ # Genes are organism-qualified in domain mode to stay distinct.
+ assert {g.id for g in model.reactions.get_by_id("R00010").genes} == {
+ "bsu:BSU31050",
+ "bsu:BSU31060",
+ }
+
+
+def test_domain_mode_eukaryotes(artefacts):
+ # Eukaryotes (hsa) -> R00010 via hsa:124/125; eco-only R00100 absent of genes
+ # but it is spontaneous, so kept without GPR.
+ model = _build(artefacts, "eukaryotes", taxonomy=DUMP / "taxonomy")
+ assert {g.id for g in model.reactions.get_by_id("R00010").genes} == {
+ "hsa:124",
+ "hsa:125",
+ }
+
+
+def test_from_artefacts_roundtrip(tmp_path):
+ parse_kegg_dump(DUMP, tmp_path)
+ model = get_kegg_model_for_organism_from_artefacts("eco", tmp_path)
+ assert {r.id for r in model.reactions} == {"R00100"}
+ assert model.reactions.get_by_id("R00100").gene_reaction_rule == "b0001"
diff --git a/tests/test_reconstruction_kegg_parse.py b/tests/test_reconstruction_kegg_parse.py
new file mode 100644
index 0000000..23d8f71
--- /dev/null
+++ b/tests/test_reconstruction_kegg_parse.py
@@ -0,0 +1,220 @@
+"""Tests for the KEGG dump parser (reconstruction/kegg/parse.py, step 3b.2)."""
+from pathlib import Path
+
+import pytest
+
+from raven_python.reconstruction.kegg import (
+ build_kegg_tables,
+ build_reference_model,
+ parse_kegg_compounds,
+ parse_kegg_dump,
+ parse_kegg_kos,
+ parse_kegg_reactions,
+ read_kegg_table,
+ write_kegg_tables,
+)
+
+DUMP = Path(__file__).parent / "data" / "kegg_dump"
+
+
+@pytest.fixture(scope="module")
+def reactions():
+ return parse_kegg_reactions(DUMP)
+
+
+@pytest.fixture(scope="module")
+def compounds():
+ return parse_kegg_compounds(DUMP)
+
+
+@pytest.fixture(scope="module")
+def kos():
+ linked = {ko for r in parse_kegg_reactions(DUMP) for ko in r.kos}
+ return parse_kegg_kos(DUMP, keep=linked)
+
+
+# --------------------------------------------------------------------------- #
+# Reactions
+# --------------------------------------------------------------------------- #
+def test_reactions_parsed(reactions):
+ assert {r.id for r in reactions} == {"R00010", "R00100", "R00200", "R00300", "R00400"}
+
+
+def test_reaction_fields(reactions):
+ r = next(r for r in reactions if r.id == "R00010")
+ assert r.name == "alpha,alpha-trehalose glucohydrolase"
+ assert r.eccodes == ["3.2.1.28"]
+ assert r.kos == ["K01194"]
+ # rn01100 is an overview map and must be skipped.
+ assert r.pathways == ["rn00500"]
+
+
+def test_stoichiometry_cached(reactions):
+ """parse_kegg_reactions populates the cached stoichiometry so
+ build_reference_model doesn't have to re-parse (known_issues.md D2)."""
+ r = next(r for r in reactions if r.id == "R00010")
+ assert r.stoichiometry # non-empty
+ # Reactants negative, products positive.
+ assert all(c != 0 for c in r.stoichiometry.values())
+ assert any(c < 0 for c in r.stoichiometry.values())
+ assert any(c > 0 for c in r.stoichiometry.values())
+
+
+def test_spontaneous_flag(reactions):
+ assert next(r for r in reactions if r.id == "R00100").spontaneous
+ assert not next(r for r in reactions if r.id == "R00010").spontaneous
+
+
+def test_general_flag(reactions):
+ assert next(r for r in reactions if r.id == "R00300").general
+
+
+def test_undefined_stoich_flag(reactions):
+ assert next(r for r in reactions if r.id == "R00200").undefined_stoich
+ assert not next(r for r in reactions if r.id == "R00010").undefined_stoich
+
+
+def test_mapformula_makes_irreversible(reactions):
+ # R00100 is drawn one direction in its only map -> irreversible.
+ assert not next(r for r in reactions if r.id == "R00100").reversible
+ # R00010 is drawn in conflicting directions across maps -> stays reversible.
+ assert next(r for r in reactions if r.id == "R00010").reversible
+
+
+# --------------------------------------------------------------------------- #
+# Compounds
+# --------------------------------------------------------------------------- #
+def test_compound_first_name_only(compounds):
+ water = next(c for c in compounds if c.id == "C00001")
+ assert water.name == "H2O"
+ assert water.chebi == ["CHEBI:15377"]
+ assert water.pubchem == ["3303"]
+
+
+def test_inchi_overrides_formula(compounds):
+ glucose = next(c for c in compounds if c.id == "C00031")
+ assert glucose.inchi.startswith("InChI=")
+ assert glucose.formula == "" # cleared when an InChI is available
+ assert glucose.chebi == ["CHEBI:4167", "CHEBI:17634"]
+
+
+# --------------------------------------------------------------------------- #
+# KOs / genes
+# --------------------------------------------------------------------------- #
+def test_kos_limited_to_keep(kos):
+ # K99999 is unlinked (excluded by keep); K09999 is referenced but absent.
+ assert {ko.id for ko in kos} == {"K01194", "K00002"}
+
+
+def test_ko_genes_lowercased_and_stripped(kos):
+ k = next(ko for ko in kos if ko.id == "K01194")
+ assert k.name == "alpha,alpha-trehalase [EC:3.2.1.28]"
+ assert ("bsu", "BSU31050") in k.genes # '(gbsB)' suffix stripped, org lowercased
+ assert ("hsa", "125") in k.genes
+
+
+# --------------------------------------------------------------------------- #
+# Reference model
+# --------------------------------------------------------------------------- #
+def test_reference_model_is_gene_free(reactions, compounds):
+ model = build_reference_model(reactions, compounds)
+ assert len(model.genes) == 0
+ for rxn in model.reactions:
+ assert rxn.gene_reaction_rule == ""
+
+
+def test_empty_reaction_dropped(reactions, compounds):
+ model = build_reference_model(reactions, compounds)
+ assert "R00400" not in model.reactions # C00007 <=> C00007 cancels out
+ assert "C00007" not in model.metabolites # and its only metabolite is unused
+
+
+def test_reaction_bounds_follow_reversibility(reactions, compounds):
+ model = build_reference_model(reactions, compounds)
+ assert model.reactions.get_by_id("R00010").bounds == (-1000.0, 1000.0)
+ assert model.reactions.get_by_id("R00100").bounds == (0.0, 1000.0)
+
+
+def test_reaction_stoichiometry_and_annotation(reactions, compounds):
+ model = build_reference_model(reactions, compounds)
+ r = model.reactions.get_by_id("R00010")
+ coefs = {m.id: c for m, c in r.metabolites.items()}
+ assert coefs == {"C01083": -1.0, "C00001": -1.0, "C00031": 2.0}
+ assert r.annotation["kegg.orthology"] == ["K01194"]
+ assert r.annotation["ec-code"] == ["3.2.1.28"]
+
+
+def test_metabolite_annotation(reactions, compounds):
+ model = build_reference_model(reactions, compounds)
+ glucose = model.metabolites.get_by_id("C00031")
+ assert glucose.name == "D-Glucose"
+ assert glucose.annotation["inchi"].startswith("InChI=")
+
+
+# --------------------------------------------------------------------------- #
+# Tables
+# --------------------------------------------------------------------------- #
+def test_ko_reaction_table(reactions, kos):
+ tables = build_kegg_tables(reactions, kos)
+ pairs = set(map(tuple, tables["ko_reaction"].to_numpy()))
+ assert ("K01194", "R00010") in pairs
+ assert ("K09999", "R00300") in pairs # kept even though KO entry is missing
+
+
+def test_organism_gene_ko_table(reactions, kos):
+ tables = build_kegg_tables(reactions, kos)
+ rows = set(map(tuple, tables["organism_gene_ko"].to_numpy()))
+ assert ("bsu", "BSU31050", "K01194") in rows
+ assert ("eco", "b0001", "K00002") in rows
+ assert len(rows) == 5
+
+
+def test_rxn_flags_table(reactions, kos):
+ tables = build_kegg_tables(reactions, kos)
+ flags = tables["rxn_flags"].set_index("reaction")
+ assert bool(flags.loc["R00100", "spontaneous"])
+ assert bool(flags.loc["R00200", "undefined_stoich"])
+ assert bool(flags.loc["R00300", "general"])
+ assert not bool(flags.loc["R00010", "spontaneous"])
+
+
+# --------------------------------------------------------------------------- #
+# Round-trip + orchestrator
+# --------------------------------------------------------------------------- #
+def test_tables_roundtrip_gzipped_tsv(reactions, kos, tmp_path):
+ tables = build_kegg_tables(reactions, kos)
+ paths = write_kegg_tables(tables, tmp_path)
+ assert all(p.name.endswith(".tsv.gz") for p in paths)
+ back = read_kegg_table(tmp_path / "ko_reaction.tsv.gz")
+ assert set(map(tuple, back.to_numpy())) == set(map(tuple, tables["ko_reaction"].to_numpy()))
+
+
+def test_parse_kegg_dump_writes_artefacts(tmp_path):
+ paths = parse_kegg_dump(DUMP, tmp_path)
+ assert set(paths) >= {
+ "ko_reaction", "ko_names", "organism_gene_ko", "rxn_flags", "reference_model"
+ }
+ assert (tmp_path / "reference_model.yml.gz").is_file()
+ # organism_gene_ko is streamed to a sorted, xz-compressed TSV.
+ assert paths["organism_gene_ko"].name == "organism_gene_ko.tsv.xz"
+ ogk = read_kegg_table(paths["organism_gene_ko"])
+ assert set(ogk.columns) == {"organism", "gene", "ko"}
+ assert ("eco", "b0001", "K00002") in set(map(tuple, ogk.to_numpy()))
+ # Rows are sorted by (organism, gene) — the property that makes them compress.
+ keys = list(zip(ogk["organism"], ogk["gene"], strict=True))
+ assert keys == sorted(keys)
+
+
+def test_stream_organism_gene_ko_external_merge(tmp_path):
+ """A tiny chunk_rows forces multiple sorted runs to be merged; output stays sorted."""
+ from raven_python.reconstruction.kegg.parse import stream_organism_gene_ko
+
+ out = tmp_path / "organism_gene_ko.tsv.xz"
+ keep = {ko.id for ko in parse_kegg_kos(DUMP)}
+ names = stream_organism_gene_ko(DUMP, keep, out, chunk_rows=1)
+ assert out.is_file() and not list(tmp_path.glob("ogk_sort_*")) # temp dir cleaned up
+ ogk = read_kegg_table(out)
+ keys = list(zip(ogk["organism"], ogk["gene"], strict=True))
+ assert keys == sorted(keys)
+ assert ("eco", "b0001", "K00002") in set(map(tuple, ogk.to_numpy()))
+ assert set(names.columns) == {"ko", "name"}
diff --git a/tests/test_reconstruction_kegg_query.py b/tests/test_reconstruction_kegg_query.py
new file mode 100644
index 0000000..49aae60
--- /dev/null
+++ b/tests/test_reconstruction_kegg_query.py
@@ -0,0 +1,132 @@
+"""Tests for the KEGG HMM-query path (reconstruction/kegg/query.py, step 3b.5)."""
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from raven_python.reconstruction.kegg import (
+ assign_kos,
+ build_kegg_tables,
+ build_reference_model,
+ get_kegg_model_from_sequences,
+ parse_hmmscan_tblout,
+ parse_kegg_compounds,
+ parse_kegg_kos,
+ parse_kegg_reactions,
+)
+
+DUMP = Path(__file__).parent / "data" / "kegg_dump"
+
+# A minimal hmmscan --tblout excerpt: target(KO) accession query(gene) ... evalue ...
+TBLOUT = """\
+# --- full sequence ----
+# target name accession query name accession E-value score bias
+#------------------- ---------- ----------- ---------- --------- ------ -----
+K01194 - gene1 - 1e-120 400.0 0.0
+K01194 - gene2 - 1e-100 350.0 0.0
+K00002 - gene1 - 1e-10 40.0 0.0
+"""
+
+
+# --------------------------------------------------------------------------- #
+# Parsing
+# --------------------------------------------------------------------------- #
+def test_parse_tblout_skips_comments():
+ hits = parse_hmmscan_tblout(TBLOUT)
+ assert list(hits.columns) == ["ko", "gene", "evalue"]
+ assert len(hits) == 3
+ assert set(hits["ko"]) == {"K01194", "K00002"}
+ assert hits.iloc[0]["evalue"] == 1e-120
+
+
+def test_parse_tblout_empty():
+ assert parse_hmmscan_tblout("# only a header\n").empty
+
+
+# --------------------------------------------------------------------------- #
+# assign_kos scoring/filters
+# --------------------------------------------------------------------------- #
+def test_cutoff_excludes_weak_hits():
+ hits = parse_hmmscan_tblout(TBLOUT)
+ # gene1->K00002 has evalue 1e-10, above the default cutoff 1e-30: dropped.
+ assigned = assign_kos(hits)
+ assert "K00002" not in assigned
+ assert set(assigned["K01194"]) == {"gene1", "gene2"}
+
+
+def test_loose_cutoff_keeps_hit():
+ hits = parse_hmmscan_tblout(TBLOUT)
+ assigned = assign_kos(hits, cutoff=1e-5, min_score_ratio_g=0.0, min_score_ratio_ko=0.0)
+ assert assigned.get("K00002") == ["gene1"]
+
+
+def test_min_score_ratio_ko_prunes_weak_member():
+ # In one KO: best 1e-200, weak 1e-20. log(1e-20)/log(1e-200)=0.1 < 0.3 -> pruned.
+ hits = pd.DataFrame(
+ [("K1", "strong", 1e-200), ("K1", "weak", 1e-20)],
+ columns=["ko", "gene", "evalue"],
+ )
+ assigned = assign_kos(hits, cutoff=1e-5, min_score_ratio_ko=0.3, min_score_ratio_g=0.0)
+ assert assigned["K1"] == ["strong"]
+
+
+def test_min_score_ratio_g_keeps_gene_in_best_ko_only():
+ # gene g hits K1 strongly (1e-200) and K2 weakly (1e-20).
+ # For the gene: log(1e-20)/log(1e-200)=0.1 < 0.8 -> K2 assignment dropped.
+ hits = pd.DataFrame(
+ [("K1", "g", 1e-200), ("K2", "g", 1e-20)],
+ columns=["ko", "gene", "evalue"],
+ )
+ assigned = assign_kos(hits, cutoff=1e-5, min_score_ratio_ko=0.0, min_score_ratio_g=0.8)
+ assert assigned == {"K1": ["g"]}
+
+
+def test_zero_evalue_does_not_crash():
+ hits = pd.DataFrame([("K1", "g", 0.0)], columns=["ko", "gene", "evalue"])
+ assert assign_kos(hits) == {"K1": ["g"]}
+
+
+def test_cutoff_ge_one_rejected():
+ """cutoff >= 1 would let log(best_evalue)=0 through and ZeroDivisionError later
+ (known_issues.md A6). Reject up front with a clear message."""
+ hits = pd.DataFrame([("K1", "g", 0.5)], columns=["ko", "gene", "evalue"])
+ with pytest.raises(ValueError, match="cutoff must be < 1"):
+ assign_kos(hits, cutoff=1.0)
+
+
+# --------------------------------------------------------------------------- #
+# Model assembly via the HMM path (hmmscan mocked)
+# --------------------------------------------------------------------------- #
+@pytest.fixture(scope="module")
+def reference_and_tables():
+ reactions = parse_kegg_reactions(DUMP)
+ compounds = parse_kegg_compounds(DUMP)
+ linked = {ko for r in reactions for ko in r.kos}
+ kos = parse_kegg_kos(DUMP, keep=linked)
+ return build_reference_model(reactions, compounds), build_kegg_tables(reactions, kos)
+
+
+def test_get_model_from_sequences(reference_and_tables, monkeypatch):
+ model_ref, tables = reference_and_tables
+ # Mock the HMM search: K01194 -> myGeneA/myGeneB (-> R00010).
+ monkeypatch.setattr(
+ "raven_python.reconstruction.kegg.query.run_hmmscan",
+ lambda *a, **k: (
+ "K01194 - myGeneA - 1e-120 400 0\n"
+ "K01194 - myGeneB - 1e-110 380 0\n"
+ ),
+ )
+ model = get_kegg_model_from_sequences(
+ "ignored.fasta",
+ model_ref,
+ tables["ko_reaction"],
+ "ignored.hmm",
+ rxn_flags=tables["rxn_flags"],
+ model_id="myorg",
+ )
+ assert model.id == "myorg"
+ r = model.reactions.get_by_id("R00010")
+ assert set(r.gene_reaction_rule.split(" or ")) == {"myGeneA", "myGeneB"}
+ assert r.notes["note"].endswith("(using HMMs)")
+ # R00200/R00300 had no matched KOs and are not spontaneous -> absent.
+ assert "R00200" not in model.reactions
diff --git a/tests/test_scripts_registry.py b/tests/test_scripts_registry.py
new file mode 100644
index 0000000..c9c03cf
--- /dev/null
+++ b/tests/test_scripts_registry.py
@@ -0,0 +1,58 @@
+"""Tests for scripts/make_registry_snippet.py registry-entry helpers."""
+import hashlib
+import importlib.util
+import json
+from pathlib import Path
+
+import pytest
+
+# scripts/ is not a package; load the module directly by path.
+_SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "make_registry_snippet.py"
+_spec = importlib.util.spec_from_file_location("make_registry_snippet", _SCRIPT)
+mrs = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(mrs)
+
+
+def _sha(data: bytes) -> str:
+ return hashlib.sha256(data).hexdigest()
+
+
+def test_data_entry_lists_files_with_urls_and_checksums(tmp_path):
+ (tmp_path / "reference_model.yml.gz").write_bytes(b"model")
+ (tmp_path / "ko_reaction.tsv.gz").write_bytes(b"table")
+ (tmp_path / ".hidden").write_bytes(b"skip") # hidden files ignored
+
+ entry = mrs.data_entry("kegg", "kegg116", "https://x/rel/", tmp_path)
+ assert entry["version"] == "kegg116"
+ assert set(entry["files"]) == {"reference_model.yml.gz", "ko_reaction.tsv.gz"}
+ ref = entry["files"]["reference_model.yml.gz"]
+ assert ref["url"] == "https://x/rel/reference_model.yml.gz" # trailing slash collapsed
+ assert ref["sha256"] == _sha(b"model")
+
+
+def test_data_entry_empty_dir_errors(tmp_path):
+ with pytest.raises(SystemExit):
+ mrs.data_entry("kegg", "v1", "https://x", tmp_path)
+
+
+def test_binary_entry_parses_platform_from_filename(tmp_path):
+ (tmp_path / "blast-2.16.0-linux-x86_64.zip").write_bytes(b"linux")
+ (tmp_path / "blast-2.16.0-macos-arm64.zip").write_bytes(b"mac")
+ (tmp_path / "other-1.0-linux-x86_64.zip").write_bytes(b"nope") # different bundle
+
+ entry = mrs.binary_entry("blast", "2.16.0", ["blastp", "makeblastdb"], "https://x", tmp_path)
+ assert entry["provides"] == ["blastp", "makeblastdb"]
+ assert set(entry["platforms"]) == {"linux-x86_64", "macos-arm64"}
+ assert entry["platforms"]["macos-arm64"]["sha256"] == _sha(b"mac")
+ assert entry["platforms"]["linux-x86_64"]["url"].endswith("blast-2.16.0-linux-x86_64.zip")
+
+
+def test_binary_entry_no_zips_errors(tmp_path):
+ with pytest.raises(SystemExit):
+ mrs.binary_entry("blast", "2.16.0", ["blastp"], "https://x", tmp_path)
+
+
+def test_render_is_valid_json_round_trip():
+ entry = {"version": "v1", "files": {"a": {"url": "u", "sha256": "s"}}}
+ text = mrs.render("kegg", entry)
+ assert json.loads(text) == {"kegg": entry}
diff --git a/tests/test_tasks.py b/tests/test_tasks.py
new file mode 100644
index 0000000..1b5e6bd
--- /dev/null
+++ b/tests/test_tasks.py
@@ -0,0 +1,189 @@
+"""Tests for metabolic tasks (Phase 4a): parse_task_list + check_tasks."""
+import cobra
+import pytest
+
+from raven_python.tasks import Task, check_tasks, parse_task_list
+
+TASK_TSV = (
+ "ID\tDESCRIPTION\tIN\tIN UB\tOUT\tOUT LB\tEQU\tSHOULD FAIL\n"
+ "T1\tgrowth\tglc[e];o2[e]\t10\tbio[c]\t1\t\t\n"
+ "T2\tinfeasible\t\t\tatp[c]\t1\t\ttrue\n"
+ "\t\t\t\tnadh[c]\t\t\t\n"
+ "T3\twithequ\tA[c]\t\tB[c]\t\tA[c] <=> B[c]\t\n"
+)
+
+
+# --------------------------------------------------------------------------- #
+# parse_task_list
+# --------------------------------------------------------------------------- #
+@pytest.fixture
+def task_file(tmp_path):
+ p = tmp_path / "tasks.txt"
+ p.write_text(TASK_TSV)
+ return p
+
+
+def test_parse_basic_and_defaults(task_file):
+ tasks = parse_task_list(task_file)
+ assert [t.id for t in tasks] == ["T1", "T2", "T3"]
+ t1 = tasks[0]
+ assert t1.description == "growth"
+ # ';' splits mets sharing the row's bounds; IN LB defaults 0, IN UB from cell.
+ assert t1.inputs == [("glc[e]", 0.0, 10.0), ("o2[e]", 0.0, 10.0)]
+ assert t1.outputs == [("bio[c]", 1.0, 1000.0)] # OUT UB defaults 1000
+
+
+def test_parse_should_fail_and_continuation(task_file):
+ t2 = parse_task_list(task_file)[1]
+ assert t2.should_fail is True
+ # continuation row (empty ID) appends nadh[c] to the same task's outputs
+ assert t2.outputs == [("atp[c]", 1.0, 1000.0), ("nadh[c]", 0.0, 1000.0)]
+
+
+def test_parse_equation_default_bounds(task_file):
+ t3 = parse_task_list(task_file)[2]
+ # reversible '<=>' -> EQU LB defaults -1000, UB 1000
+ assert t3.equations == [("A[c] <=> B[c]", -1000.0, 1000.0)]
+
+
+def test_parse_missing_id_column(tmp_path):
+ p = tmp_path / "bad.txt"
+ p.write_text("FOO\tBAR\nx\ty\n")
+ with pytest.raises(ValueError, match="ID"):
+ parse_task_list(p)
+
+
+def test_parse_warns_on_data_row_before_first_id(tmp_path):
+ """known_issues.md B3: continuation rows appearing before the first task ID
+ used to be silently dropped. Now warns so the user sees the malformed file."""
+ p = tmp_path / "orphan.txt"
+ p.write_text(
+ "ID\tDESCRIPTION\tIN\tIN UB\tOUT\tOUT UB\tSHOULD FAIL\n"
+ "\t\tglc[e]\t10\t\t\t\n" # orphan data row, no ID seen yet
+ "T1\tgrowth\t\t\tbio[c]\t1\t\n"
+ )
+ with pytest.warns(UserWarning, match="no task ID has been seen yet"):
+ tasks = parse_task_list(p)
+ assert [t.id for t in tasks] == ["T1"]
+ # The orphan row's data isn't grafted onto T1 either.
+ assert tasks[0].inputs == []
+
+
+def test_parse_task_list_xlsx_missing_tasks_sheet(tmp_path):
+ """A .xlsx without a 'TASKS' sheet used to raise a bare KeyError; now
+ raises a clear ValueError naming the actual sheets (known_issues.md C3)."""
+ pytest.importorskip("openpyxl")
+ from openpyxl import Workbook
+
+ wb = Workbook()
+ wb.active.title = "NotTasks"
+ p = tmp_path / "wrong.xlsx"
+ wb.save(p)
+ with pytest.raises(ValueError, match="no sheet named 'TASKS'"):
+ parse_task_list(p)
+
+
+# --------------------------------------------------------------------------- #
+# check_tasks
+# --------------------------------------------------------------------------- #
+def _met(mid, name, comp="c"):
+ return cobra.Metabolite(mid, name=name, compartment=comp)
+
+
+@pytest.fixture
+def model():
+ """Closed model: A -> B (r1); D present but unproduced."""
+ m = cobra.Model("t")
+ A, B, D = _met("A_c", "A"), _met("B_c", "B"), _met("D_c", "D")
+ m.add_metabolites([A, B, D])
+ r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+ r1.add_metabolites({A: -1, B: 1})
+ m.add_reactions([r1])
+ return m
+
+
+def _by_id(results):
+ return {r.id: r for r in results}
+
+
+def test_feasible_task_passes(model):
+ # OUT LB=1 requires producing B (LB=0 would pass trivially via zero flux).
+ task = Task("make_B", inputs=[("A[c]", 0, 1000)], outputs=[("B[c]", 1, 1000)])
+ (res,) = check_tasks(model, [task])
+ assert res.feasible and res.passed
+
+
+def test_should_fail_task_passes_when_infeasible(model):
+ # Require producing B with no input -> infeasible -> should_fail makes it pass.
+ task = Task("no_input", outputs=[("B[c]", 1, 1000)], should_fail=True)
+ (res,) = check_tasks(model, [task])
+ assert not res.feasible and res.passed
+
+
+def test_unsatisfiable_task_fails(model):
+ task = Task("need_B", outputs=[("B[c]", 1, 1000)]) # no input, not should_fail
+ (res,) = check_tasks(model, [task])
+ assert not res.feasible and not res.passed
+
+
+def test_equation_adds_pathway(model):
+ # Model can't make D; the task's extra reaction B -> D enables output of D.
+ task = Task(
+ "make_D",
+ inputs=[("A[c]", 0, 1000)],
+ outputs=[("D[c]", 1, 1000)],
+ equations=[("B[c] => D[c]", 0.0, 1000.0)],
+ )
+ (res,) = check_tasks(model, [task])
+ assert res.passed
+ # without the extra reaction D cannot be made
+ (res2,) = check_tasks(model, [Task("make_D2", inputs=[("A[c]", 0, 1000)], outputs=[("D[c]", 1, 1000)])])
+ assert not res2.passed
+
+
+def test_changed_bounds_block_reaction(model):
+ # Blocking r1 makes B unproducible.
+ task = Task(
+ "block_r1",
+ inputs=[("A[c]", 0, 1000)],
+ outputs=[("B[c]", 1, 1000)],
+ changed=[("r1", 0.0, 0.0)],
+ )
+ (res,) = check_tasks(model, [task])
+ assert not res.passed
+
+
+def test_allmets_output(model):
+ # Force uptake of A (IN LB=1); the only fate is A->B, so B must be excreted.
+ # ALLMETS output permits that, making the task feasible; without it B accumulates.
+ task = Task("sink_all", inputs=[("A[c]", 1, 1000)], outputs=[("ALLMETS", 0, 1000)])
+ (res,) = check_tasks(model, [task])
+ assert res.passed
+ (res2,) = check_tasks(model, [Task("forced_no_out", inputs=[("A[c]", 1, 1000)])])
+ assert not res2.passed # forced A uptake but nowhere for B to go
+
+
+def test_unknown_metabolite_reported(model):
+ task = Task("typo", inputs=[("Z[c]", 0, 1000)], outputs=[("B[c]", 0, 1000)])
+ (res,) = check_tasks(model, [task])
+ assert not res.passed and "unknown metabolite" in res.error
+
+
+def test_open_exchange_is_closed_so_task_controls_io(model):
+ # An open demand for B would let B leave for free; check_tasks closes it, so a
+ # task with no output for B and a forced... here: B has an open sink, but the
+ # task defines only input A and no output -> B must still balance (sink closed).
+ model.add_boundary(model.metabolites.B_c, type="sink") # open B sink
+ task = Task("need_D_out", inputs=[("A[c]", 0, 1000)], outputs=[("D[c]", 1, 1000)])
+ (res,) = check_tasks(model, [task])
+ assert not res.passed # D still cannot be produced despite the (now-closed) B sink
+
+
+def test_check_tasks_accepts_a_file_path(model, tmp_path):
+ p = tmp_path / "t.txt"
+ p.write_text(
+ "ID\tDESCRIPTION\tIN\tOUT\tOUT LB\n"
+ "make_B\tconvert\tA[c]\tB[c]\t1\n"
+ )
+ results = check_tasks(model, p) # path, parsed internally
+ assert _by_id(results)["make_B"].passed
diff --git a/tests/test_tasks_essential.py b/tests/test_tasks_essential.py
new file mode 100644
index 0000000..5352378
--- /dev/null
+++ b/tests/test_tasks_essential.py
@@ -0,0 +1,114 @@
+"""Phase 4d.1: essential-reaction discovery for tasks (find_task_essential_reactions).
+
+Oracle: RAVEN tinitTests T0002 — for testModel + the "make e[s] from a[s]" task, the
+pre-merge essential reactions are R2 (the only a[s]<->a[c] link) and R7 (the only
+e[c]->e[s] producer); the alternative internal paths make nothing else essential.
+"""
+import cobra
+from tinit_oracles import (
+ TEST_MODEL_TASK_ESSENTIAL_PREMERGE,
+ make_test_model,
+ make_test_task,
+)
+
+from raven_python.tasks import (
+ EssentialReactionsResult,
+ Task,
+ find_task_essential_reactions,
+)
+
+
+def test_essential_reactions_match_oracle():
+ res = find_task_essential_reactions(make_test_model(), [make_test_task()])
+ assert isinstance(res, EssentialReactionsResult)
+ assert sorted(res.reactions) == TEST_MODEL_TASK_ESSENTIAL_PREMERGE # ['R2', 'R7']
+ assert not res.failed_tasks
+
+
+def test_essential_directions_are_forward():
+ """R2 (a[s]->a[c]) and R7 (e[c]->e[s]) both carry positive flux for this task."""
+ res = find_task_essential_reactions(make_test_model(), [make_test_task()])
+ assert res.reactions == {"R2": 1, "R7": 1}
+
+
+def test_task_metabolites_collected():
+ """a[s] and e[s] are referenced by the task and must be protected from removal."""
+ res = find_task_essential_reactions(make_test_model(), [make_test_task()])
+ m = make_test_model()
+ names = {res_id: f"{m.metabolites.get_by_id(res_id).name}"
+ f"[{m.metabolites.get_by_id(res_id).compartment}]" for res_id in res.task_metabolites}
+ assert set(names.values()) == {"a[s]", "e[s]"}
+
+
+def test_no_task_no_essentials():
+ res = find_task_essential_reactions(make_test_model(), [])
+ assert res.reactions == {} and res.per_task == {}
+
+
+def test_equation_metabolites_are_protected():
+ """A task equation's metabolites count as task metabolites (protected from removal)."""
+ m = make_test_model()
+ task = Task(
+ id="equ",
+ inputs=[("a[s]", 0.0, 1000.0)],
+ outputs=[("e[c]", 1.0, 1.0)],
+ equations=[("a[c] => e[c]", 0.0, 1000.0)], # references a[c], which is not an I/O met
+ )
+ res = find_task_essential_reactions(m, [task])
+ names = {f"{m.metabolites.get_by_id(i).name}[{m.metabolites.get_by_id(i).compartment}]"
+ for i in res.task_metabolites}
+ assert {"a[c]", "e[c]"} <= names and "equ" not in res.failed_tasks
+
+
+def test_infeasible_task_is_reported_failed():
+ """A task requiring an impossible output is dropped, not crashed."""
+ impossible = Task(id="bad", outputs=[("z[s]", 1.0, 1.0)])
+ # z[s] doesn't exist -> unknown metabolite -> failed.
+ res = find_task_essential_reactions(make_test_model(), [impossible])
+ assert res.failed_tasks == ["bad"] and res.reactions == {}
+
+
+def test_should_fail_task_defines_no_essentials():
+ res = find_task_essential_reactions(
+ make_test_model(), [Task(id="sf", should_fail=True, outputs=[("e[s]", 1.0, 1.0)])]
+ )
+ assert res.reactions == {} and "sf" not in res.per_task
+
+
+def test_direction_majority_across_tasks():
+ """A reaction essential reverse in two tasks and forward in one is recorded reverse."""
+ # Build a tiny model where a single reaction must run in a chosen direction.
+ m = cobra.Model("dir")
+ a, b = (cobra.Metabolite(x, name=x, compartment="s") for x in "ab")
+ m.add_metabolites([a, b])
+ r = cobra.Reaction("REV", lower_bound=-1000, upper_bound=1000)
+ r.add_metabolites({a: -1, b: 1}) # a <=> b
+ m.add_reactions([r])
+ m.objective = "REV"
+ # Task forcing net production of b from a -> REV forward (+1).
+ fwd = Task(id="fwd", inputs=[("a[s]", 0.0, 1000.0)], outputs=[("b[s]", 1.0, 1.0)])
+ # Two tasks forcing net production of a from b -> REV reverse (-1). Distinct ids
+ # (task lists have unique ids; essential discovery de-duplicates by id).
+ rev1 = Task(id="rev1", inputs=[("b[s]", 0.0, 1000.0)], outputs=[("a[s]", 1.0, 1.0)])
+ rev2 = Task(id="rev2", inputs=[("b[s]", 0.0, 1000.0)], outputs=[("a[s]", 1.0, 1.0)])
+ res = find_task_essential_reactions(m, [rev1, rev2, fwd])
+ assert res.reactions["REV"] == -1 # two reverse votes beat one forward
+
+
+def test_duplicate_name_comp_metabolites_both_constrained():
+ """A task referencing a name[comp] shared by two metabolites resolves (not 'missing')."""
+ m = cobra.Model("dup")
+ # Two distinct metabolites with the SAME name and compartment.
+ a1 = cobra.Metabolite("a1", name="a", compartment="s")
+ a2 = cobra.Metabolite("a2", name="a", compartment="s")
+ b = cobra.Metabolite("b", name="b", compartment="s")
+ m.add_metabolites([a1, a2, b])
+ r1 = cobra.Reaction("R1", lower_bound=0, upper_bound=1000)
+ r1.add_metabolites({a1: -1, b: 1}) # only a1 feeds b
+ m.add_reactions([r1])
+ m.objective = "R1"
+ # Output b from input a -> 'a[s]' matches both a1 and a2; must not be reported missing.
+ task = Task(id="t", inputs=[("a[s]", 0.0, 1000.0)], outputs=[("b[s]", 1.0, 1.0)])
+ res = find_task_essential_reactions(m, [task])
+ assert res.failed_tasks == [] # 'a[s]' resolved (to both a1 and a2), task feasible
+ assert "R1" in res.reactions
diff --git a/tests/test_utils_balance.py b/tests/test_utils_balance.py
new file mode 100644
index 0000000..aa1e47e
--- /dev/null
+++ b/tests/test_utils_balance.py
@@ -0,0 +1,76 @@
+"""Tests for get_elemental_balance (getElementalBalance port)."""
+import cobra
+import pytest
+
+from raven_python.utils import ElementalBalance, get_elemental_balance
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("a_c", formula="C6H12O6", charge=0, compartment="c"),
+ cobra.Metabolite("b_c", formula="C6H12O6", charge=0, compartment="c"),
+ cobra.Metabolite("c_c", formula="C3H6O3", charge=0, compartment="c"),
+ cobra.Metabolite("n_c", compartment="c"), # no formula
+ ]
+ )
+ r_bal = cobra.Reaction("R_bal")
+ m.add_reactions([r_bal])
+ r_bal.build_reaction_from_string("a_c --> b_c") # C6H12O6 -> C6H12O6
+ r_unbal = cobra.Reaction("R_unbal")
+ m.add_reactions([r_unbal])
+ r_unbal.build_reaction_from_string("a_c --> c_c") # C6H12O6 -> C3H6O3
+ r_unknown = cobra.Reaction("R_unknown")
+ m.add_reactions([r_unknown])
+ r_unknown.build_reaction_from_string("a_c --> n_c") # n_c has no formula
+ return m
+
+
+def test_balanced(model):
+ (res,) = get_elemental_balance(model, ["R_bal"])
+ assert res == ElementalBalance("R_bal", "balanced", {})
+
+
+def test_unbalanced_reports_imbalance(model):
+ (res,) = get_elemental_balance(model, ["R_unbal"])
+ assert res.status == "unbalanced"
+ # products - reactants: C3H6O3 - C6H12O6 = -C3H6O3
+ assert res.imbalance == {"C": -3.0, "H": -6.0, "O": -3.0}
+
+
+def test_missing_formula_is_unknown_not_silently_wrong(model):
+ # cobra's check_mass_balance alone would silently report an imbalance here;
+ # we flag it as unknown instead.
+ (res,) = get_elemental_balance(model, ["R_unknown"])
+ assert res.status == "unknown"
+ assert res.imbalance == {}
+
+
+def test_all_reactions_default(model):
+ results = get_elemental_balance(model)
+ assert {r.reaction_id: r.status for r in results} == {
+ "R_bal": "balanced",
+ "R_unbal": "unbalanced",
+ "R_unknown": "unknown",
+ }
+
+
+def test_charge_excluded(model):
+ # give a charge imbalance but keep elements balanced -> still "balanced"
+ model.metabolites.get_by_id("b_c").charge = 1
+ (res,) = get_elemental_balance(model, ["R_bal"])
+ assert res.status == "balanced"
+
+
+# --- regression: empty reaction → unknown (known_issues.md F5) -------------
+
+def test_empty_reaction_is_unknown(model):
+ """A reaction with no metabolites used to be reported `balanced`
+ vacuously (any() over an empty list is False and check_mass_balance
+ returns no imbalance). Now reports `unknown`."""
+ empty = cobra.Reaction("R_empty", lower_bound=0, upper_bound=1000)
+ model.add_reactions([empty])
+ (res,) = get_elemental_balance(model, ["R_empty"])
+ assert res.status == "unknown"
diff --git a/tests/test_utils_gpr.py b/tests/test_utils_gpr.py
new file mode 100644
index 0000000..275d020
--- /dev/null
+++ b/tests/test_utils_gpr.py
@@ -0,0 +1,84 @@
+"""Tests for raven_python.utils.gpr (GPR linting)."""
+import cobra
+import pytest
+
+from raven_python.utils import GPRIssue, find_non_dnf_grrules, is_dnf
+
+
+@pytest.mark.parametrize(
+ "rule",
+ [
+ "",
+ "G1",
+ "G1 and G2",
+ "G1 or G2",
+ "G1 and G2 and G3",
+ "G1 or G2 or G3",
+ "(G1 and G2) or G3",
+ "(G1 and G2) or (G3 and G4)",
+ "G1 or (G2 and G3)",
+ ],
+)
+def test_is_dnf_true(rule):
+ assert is_dnf(rule) is True
+
+
+@pytest.mark.parametrize(
+ "rule",
+ [
+ "(G1 or G2) and G3",
+ "G1 and (G2 or G3)",
+ "(G1 or G2) and (G3 or G4)",
+ "G1 and (G2 or (G3 and G4))",
+ ],
+)
+def test_is_dnf_false(rule):
+ assert is_dnf(rule) is False
+
+
+def test_is_dnf_accepts_gpr_and_none():
+ from cobra.core.gene import GPR
+
+ assert is_dnf(GPR.from_string("(G1 or G2) and G3")) is False
+ assert is_dnf(GPR.from_string("G1 or G2")) is True
+ assert is_dnf(None) is True
+
+
+def test_is_dnf_independent_of_formatting():
+ # cobra normalises on assignment, so casing/whitespace cannot change the verdict.
+ assert is_dnf("(G1 OR G2) AND G3") is False
+ assert is_dnf("( G1 and G2 ) or G3") is True
+
+
+def _model_with_rules(rules: dict[str, str]) -> cobra.Model:
+ model = cobra.Model("t")
+ model.add_reactions([cobra.Reaction(rid) for rid in rules])
+ for rid, rule in rules.items():
+ model.reactions.get_by_id(rid).gene_reaction_rule = rule
+ return model
+
+
+def test_find_non_dnf_grrules_flags_only_offenders():
+ model = _model_with_rules(
+ {
+ "R_ok_single": "G1",
+ "R_ok_complex": "G1 and G2",
+ "R_ok_dnf": "(G1 and G2) or G3",
+ "R_no_gpr": "",
+ "R_bad_1": "(G1 or G2) and G3",
+ "R_bad_2": "(G1 or G2) and (G3 or G4)",
+ }
+ )
+
+ issues = find_non_dnf_grrules(model)
+
+ assert [i.reaction_id for i in issues] == ["R_bad_1", "R_bad_2"]
+ assert all(isinstance(i, GPRIssue) for i in issues)
+ assert all("disjunctive normal form" in i.reason for i in issues)
+ # the reported GPR is the cobra-normalised string
+ assert issues[0].gpr == "(G1 or G2) and G3"
+
+
+def test_find_non_dnf_grrules_empty_when_all_clean():
+ model = _model_with_rules({"R1": "G1 or G2", "R2": "(G1 and G2) or G3"})
+ assert find_non_dnf_grrules(model) == []
diff --git a/tests/test_utils_sort.py b/tests/test_utils_sort.py
new file mode 100644
index 0000000..18bca24
--- /dev/null
+++ b/tests/test_utils_sort.py
@@ -0,0 +1,42 @@
+"""Tests for sort_identifiers and write_yaml_model(sort_ids=True)."""
+import cobra
+
+from raven_python.io import read_yaml_model, write_yaml_model
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils import sort_identifiers
+
+
+def _model():
+ m = cobra.Model("t")
+ m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("b_c", "a_c")])
+ add_reactions_from_equations(
+ m,
+ [
+ {"id": "R2", "equation": "a_c --> b_c", "gene_reaction_rule": "GB"},
+ {"id": "R1", "equation": "b_c --> a_c", "gene_reaction_rule": "GA"},
+ ],
+ )
+ return m
+
+
+def test_sort_identifiers_orders_everything():
+ m = _model()
+ sort_identifiers(m)
+ assert [r.id for r in m.reactions] == ["R1", "R2"]
+ assert [x.id for x in m.metabolites] == ["a_c", "b_c"]
+ assert [g.id for g in m.genes] == ["GA", "GB"]
+ # lookup index still intact after sorting
+ assert m.reactions.get_by_id("R2").id == "R2"
+
+
+def test_write_yaml_sort_ids_does_not_mutate(tmp_path):
+ m = _model()
+ order_before = [r.id for r in m.reactions]
+ out = tmp_path / "m.yml"
+ write_yaml_model(m, out, sort_ids=True)
+ assert [r.id for r in m.reactions] == order_before # model untouched
+ # but the file is sorted
+ text = out.read_text()
+ assert text.index("R1") < text.index("R2")
+ reloaded = read_yaml_model(out)
+ assert [r.id for r in reloaded.reactions] == ["R1", "R2"]
diff --git a/tests/test_utils_validate.py b/tests/test_utils_validate.py
new file mode 100644
index 0000000..2d38e6f
--- /dev/null
+++ b/tests/test_utils_validate.py
@@ -0,0 +1,80 @@
+"""Tests for check_model (the surviving checks of checkModelStruct)."""
+import cobra
+import pytest
+
+from raven_python.manipulation import add_reactions_from_equations
+from raven_python.utils import ModelIssue, check_model
+
+
+def _categories(issues, category):
+ return [i.object_id for i in issues if i.category == category]
+
+
+@pytest.fixture
+def model():
+ m = cobra.Model("t")
+ m.add_metabolites(
+ [
+ cobra.Metabolite("a_c", name="A", compartment="c"),
+ cobra.Metabolite("b_c", name="B", compartment="c"),
+ ]
+ )
+ add_reactions_from_equations(
+ m, [{"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}]
+ )
+ m.reactions.get_by_id("R1").objective_coefficient = 1
+ return m
+
+
+def test_clean_model_has_no_issues(model):
+ assert check_model(model) == []
+
+
+def test_orphan_metabolite(model):
+ model.add_metabolites([cobra.Metabolite("orphan_c", name="Orphan", compartment="c")])
+ assert "orphan_c" in _categories(check_model(model), "orphan_metabolite")
+
+
+def test_orphan_gene(model):
+ model.genes.append(cobra.core.gene.Gene("G_lonely"))
+ assert "G_lonely" in _categories(check_model(model), "orphan_gene")
+
+
+def test_empty_reaction(model):
+ model.add_reactions([cobra.Reaction("R_empty")])
+ assert "R_empty" in _categories(check_model(model), "empty_reaction")
+
+
+def test_empty_metabolite_name(model):
+ model.add_metabolites([cobra.Metabolite("noname_c", compartment="c")])
+ # also an orphan, but we check the name category specifically
+ assert "noname_c" in _categories(check_model(model), "empty_metabolite_name")
+
+
+def test_duplicate_name_compartment(model):
+ # second metabolite named "A" in compartment c
+ dup = cobra.Metabolite("a2_c", name="A", compartment="c")
+ model.add_metabolites([dup])
+ model.reactions.get_by_id("R1").add_metabolites({dup: -1}) # keep it used
+ issues = [i for i in check_model(model) if i.category == "duplicate_name_compartment"]
+ assert len(issues) == 1
+ assert "a_c" in issues[0].message and "a2_c" in issues[0].message
+
+
+def test_no_objective(model):
+ model.reactions.get_by_id("R1").objective_coefficient = 0
+ cats = [i.category for i in check_model(model)]
+ assert "objective" in cats
+
+
+def test_multiple_objectives(model):
+ add_reactions_from_equations(model, [{"id": "R2", "equation": "b_c --> a_c"}])
+ model.reactions.get_by_id("R2").objective_coefficient = 1
+ obj_issues = [i for i in check_model(model) if i.category == "objective"]
+ assert len(obj_issues) == 1
+ assert "Multiple" in obj_issues[0].message
+
+
+def test_returns_model_issue_instances(model):
+ model.add_reactions([cobra.Reaction("R_empty")])
+ assert all(isinstance(i, ModelIssue) for i in check_model(model))
diff --git a/tests/tinit_oracles.py b/tests/tinit_oracles.py
new file mode 100644
index 0000000..638956d
--- /dev/null
+++ b/tests/tinit_oracles.py
@@ -0,0 +1,161 @@
+"""Shared (ft)INIT test oracles, ported from RAVEN's ``tinitTests.m``.
+
+These toy models have **defined reaction scores** and **known ftINIT outputs**, so
+they serve as exact correctness oracles for the Phase 4d port (see
+docs/ftinit_review_and_plan.md). Building them here once lets every sub-phase
+(essential-reaction discovery, the MILP, linear merge, staging) check against the
+same RAVEN-verified answers.
+
+Reaction scores are injected through gene expression using :func:`expr_for_rxn_score`
+(RAVEN's ``getExprForRxnScore``): each toy reaction ``Ri`` has at most one gene
+``Gi``, so an expression of ``exp(score_i/5)`` reproduces the desired score exactly
+(no-gene reactions get ``no_gene_score = -2`` regardless).
+"""
+from __future__ import annotations
+
+import math
+
+import cobra
+
+
+def expr_for_rxn_score(scores, threshold: float = 1.0) -> dict:
+ """RAVEN ``getExprForRxnScore``: gene expression giving a target single-gene score.
+
+ Inverts ``score = 5·ln(level/threshold)`` → ``level = threshold·exp(score/5)``.
+ Returns ``{Gi: level}`` for i = 1..len(scores) (gene name ``"G{i}"``), mirroring the
+ 1-reaction-1-gene layout of the toy models.
+ """
+ return {f"G{i + 1}": threshold * math.exp(s / 5) for i, s in enumerate(scores)}
+
+
+def _build(model_id, mets, reactions, objective):
+ """mets: {id: (name, compartment)}; reactions: {id: (stoich, lb, ub, gpr)}."""
+ m = cobra.Model(model_id)
+ met_objs = {
+ mid: cobra.Metabolite(mid, name=name, compartment=comp)
+ for mid, (name, comp) in mets.items()
+ }
+ m.add_metabolites(list(met_objs.values()))
+ for rid, (stoich, lb, ub, gpr) in reactions.items():
+ r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub)
+ r.add_metabolites({met_objs[mid]: coeff for mid, coeff in stoich.items()})
+ m.add_reactions([r])
+ if gpr:
+ r.gene_reaction_rule = gpr
+ m.objective = objective
+ return m
+
+
+# --------------------------------------------------------------------------- #
+# testModel — RAVEN getTstModel(): 8 mets, 10 rxns. a[s] -> ... -> e[s] export.
+# --------------------------------------------------------------------------- #
+_TEST_METS = {
+ "as": ("a", "s"), "ac": ("a", "c"), "bc": ("b", "c"), "cc": ("c", "c"),
+ "dc": ("d", "c"), "ec": ("e", "c"), "es": ("e", "s"), "fc": ("f", "c"),
+}
+_TEST_RXNS = {
+ "R1": ({"as": 1}, 0, 1000, ""), # -> a[s] (exchange, no GPR)
+ "R2": ({"as": -1, "ac": 1}, -1000, 1000, ""), # a[s] <=> a[c] (transport, no GPR)
+ "R3": ({"ac": -1, "bc": 1, "cc": 1}, -1000, 1000, "G3"),
+ "R4": ({"ac": -1, "dc": 2}, -1000, 1000, "G4"),
+ "R5": ({"bc": -1, "cc": -1, "ec": 1}, 0, 1000, "G5"),
+ "R6": ({"dc": -2, "ec": 1}, 0, 1000, "G6"),
+ "R7": ({"ec": -1, "es": 1}, 0, 1000, "G7"), # transport, with GPR
+ "R8": ({"es": -1}, 0, 1000, ""), # e[s] -> (exchange, no GPR)
+ "R9": ({"ac": -1, "fc": 1}, -1000, 1000, "G9"),
+ "R10": ({"fc": -1, "ec": 1}, -1000, 1000, "G10"),
+}
+# RAVEN getTstModelRxnScores(), R1..R10.
+TEST_MODEL_SCORES = [-2, -2, -1, 7, 0.5, 0.5, -1, -2, -3, 3.5]
+
+
+def make_test_model() -> cobra.Model:
+ return _build("testModel", _TEST_METS, _TEST_RXNS, "R8")
+
+
+# Oracles (RAVEN tinitTests):
+# T0001 ftINIT, no tasks, default '1+1':
+TEST_MODEL_FTINIT_NO_TASKS = ["R1", "R4", "R6", "R8", "R9", "R10"]
+# T0001 with R7,R10 spontaneous:
+TEST_MODEL_FTINIT_SPONT_R7_R10 = ["R1", "R2", "R4", "R6", "R7", "R8"]
+# T0002 with task "gen e[s] from a[s]": essential rxns (pre-merge ids) and output:
+TEST_MODEL_TASK_ESSENTIAL_PREMERGE = ["R2", "R7"]
+TEST_MODEL_TASK_ESSENTIAL_MERGED = ["R1", "R7"]
+TEST_MODEL_FTINIT_WITH_TASK = ["R1", "R2", "R4", "R6", "R7", "R8", "R9", "R10"]
+# T0004 mergeLinear(testModel): merges {R1,R2},{R3,R5},{R4,R6},{R7,R8},{R9,R10}
+TEST_MODEL_GROUP_IDS = [1, 1, 2, 3, 2, 3, 4, 4, 5, 5]
+TEST_MODEL_MERGED_REV = [0, 0, 0, 0, 1]
+TEST_MODEL_MERGED_LB = [0, 0, 0, 0, -1000]
+# groupRxnScores with R1,R2,R8 zeroed (toIgnore): -> per merged group
+TEST_MODEL_GROUPED_SCORES = [0, -0.5, 7.5, -1, 0.5]
+
+
+# The task: generate e[s] from a[s] (RAVEN getTstModelTasks()).
+def make_test_task():
+ """RAVEN getTstModelTasks(): make e[s] from a[s]."""
+ from raven_python.tasks import Task
+
+ return Task(
+ id="Gen e[s] from a[s]",
+ description="Gen e[s] from a[s]",
+ inputs=[("a[s]", 0.0, math.inf)], # (token, LBin, UBin)
+ outputs=[("e[s]", 1.0, 1.0)], # (token, LBout, UBout)
+ )
+
+
+# --------------------------------------------------------------------------- #
+# testModel4 — RAVEN getTstModel4(): partial linear merges + flips.
+# --------------------------------------------------------------------------- #
+_TEST4_METS = {
+ "a": ("a", "s"), "b": ("b", "s"), "d": ("d", "s"), "e": ("e", "s"),
+ "f": ("f", "s"), "g": ("g", "s"), "h": ("h", "s"),
+}
+_TEST4_RXNS = {
+ "R1": ({"a": -1}, -1000, 1000, "G1"), # a[s] <=>
+ "R2": ({"a": -1, "b": 1}, 0, 1000, "G2"), # a[s] -> b[s]
+ "R3": ({"a": -1, "b": 1}, -1000, 1000, "G3"), # a[s] <=> b[s]
+ "R4": ({"b": -1}, 0, 1000, "G4"), # b[s] ->
+ "R5": ({"a": -5, "d": 5}, -1000, 1000, "G5"), # 5 a[s] <=> 5 d[s]
+ "R6": ({"e": -1, "d": 1}, -1000, 1000, "G6"), # e[s] <=> d[s]
+ "R7": ({"f": -1, "g": -1, "e": 1}, -1000, 1000, "G7"), # f[s]+g[s] <=> e[s]
+ "R8": ({"b": -1, "f": 1}, -1000, 1000, "G8"), # b[s] <=> f[s]
+ "R9": ({"h": -1, "g": 1}, -1000, 1000, "G9"), # h[s] <=> g[s]
+ "R10": ({"h": -1}, 0, 1000, "G10"), # h[s] ->
+ "R11": ({"e": -1, "g": 1}, 0, 1000, "G11"), # e[s] -> g[s]
+}
+TEST_MODEL4_SCORES = [-1, -1, 2, -1, 0.5, -2, 1, 1.3, -0.5, -0.4, 8]
+# T0004 mergeLinear(testModel4): merges {R5,R6},{R7,R8},{R9,R10}; rest unmerged.
+TEST_MODEL4_GROUP_IDS = [0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 0]
+TEST_MODEL4_MERGED_REV = [1, 0, 1, 0, 1, 1, 0, 0]
+TEST_MODEL4_REVERSED_RXNS = ["R6", "R9"] # flipped direction when made irreversible
+
+
+def make_test_model4() -> cobra.Model:
+ return _build("testModel4", _TEST4_METS, _TEST4_RXNS, "R4")
+
+
+# --------------------------------------------------------------------------- #
+# testModel5 — RAVEN getTstModel5(): testModel + an unmerged parallel path R11-R14.
+# --------------------------------------------------------------------------- #
+def make_test_model5() -> cobra.Model:
+ m = make_test_model()
+ m.id = "testModel5"
+ m.add_metabolites([cobra.Metabolite("gc", name="g", compartment="c")])
+ gc = m.metabolites.get_by_id("gc")
+ ac = m.metabolites.get_by_id("ac")
+ ec = m.metabolites.get_by_id("ec")
+ extra = {
+ "R11": ({ac: -1, gc: 1}, -1000, 1000, "G11"),
+ "R12": ({ac: -1, gc: 1}, -1000, 1000, "G12"),
+ "R13": ({gc: -1, ec: 1}, -1000, 1000, "G13"),
+ "R14": ({gc: -1, ec: 1}, -1000, 1000, "G14"),
+ }
+ for rid, (stoich, lb, ub, gpr) in extra.items():
+ r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub)
+ r.add_metabolites(stoich)
+ m.add_reactions([r])
+ r.gene_reaction_rule = gpr
+ return m
+
+
+TEST_MODEL5_SCORES = [*TEST_MODEL_SCORES, -1, -1.5, -1, -1.5]