diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3f6d19b --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,48 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + workflow_dispatch: + +# A push that obsoletes a previous run cancels it. +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + name: ruff + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + cache-dependency-path: pyproject.toml + - run: pip install --upgrade pip + - run: pip install ruff + - run: ruff check . + + test: + name: pytest (py${{ matrix.python }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python: ["3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: pyproject.toml + - run: pip install --upgrade pip + # ``-e .[dev,plotting,excel]`` so every optional extra is exercised. + # Gurobi is not installable on free runners; the relevant tests + # skip themselves when ``optlang.gurobi_interface`` cannot import. + - run: pip install -e ".[dev,plotting,excel]" + - run: pytest -q --maxfail=5 --durations=20 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1f44cf3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,541 @@ +GNU GENERAL PUBLIC LICENSE +========================== + +Version 3, 29 June 2007 + +Copyright © 2007 Free Software Foundation, Inc. <https://fsf.org/> + +Everyone is permitted to copy and distribute verbatim copies of this license +document, but changing it is not allowed. + +## Preamble + +The GNU General Public License is a free, copyleft license for software and other +kinds of works. + +The licenses for most software and other practical works are designed to take away +your freedom to share and change the works. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change all versions of a +program--to make sure it remains free software for all its users. We, the Free +Software Foundation, use the GNU General Public License for most of our software; it +applies also to any other work released this way by its authors. You can apply it to +your programs, too. + +When we speak of free software, we are referring to freedom, not price. Our General +Public Licenses are designed to make sure that you have the freedom to distribute +copies of free software (and charge for them if you wish), that you receive source +code or can get it if you want it, that you can change the software or use pieces of +it in new free programs, and that you know you can do these things. + +To protect your rights, we need to prevent others from denying you these rights or +asking you to surrender the rights. Therefore, you have certain responsibilities if +you distribute copies of the software, or if you modify it: responsibilities to +respect the freedom of others. + +For example, if you distribute copies of such a program, whether gratis or for a fee, +you must pass on to the recipients the same freedoms that you received. You must make +sure that they, too, receive or can get the source code. And you must show them these +terms so they know their rights. + +Developers that use the GNU GPL protect your rights with two steps: (1) assert +copyright on the software, and (2) offer you this License giving you legal permission +to copy, distribute and/or modify it. + +For the developers' and authors' protection, the GPL clearly explains that there is +no warranty for this free software. For both users' and authors' sake, the GPL +requires that modified versions be marked as changed, so that their problems will not +be attributed erroneously to authors of previous versions. 
+ +Some devices are designed to deny users access to install or run modified versions of +the software inside them, although the manufacturer can do so. This is fundamentally +incompatible with the aim of protecting users' freedom to change the software. The +systematic pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we have designed +this version of the GPL to prohibit the practice for those products. If such problems +arise substantially in other domains, we stand ready to extend this provision to +those domains in future versions of the GPL, as needed to protect the freedom of +users. + +Finally, every program is threatened constantly by software patents. States should +not allow patents to restrict development and use of software on general-purpose +computers, but in those that do, we wish to avoid the special danger that patents +applied to a free program could make it effectively proprietary. To prevent this, the +GPL assures that patents cannot be used to render the program non-free. + +The precise terms and conditions for copying, distribution and modification follow. + +## TERMS AND CONDITIONS + +### 0. Definitions. + +“This License” refers to version 3 of the GNU General Public License. + +“Copyright” also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + +“The Program” refers to any copyrightable work licensed under this +License. Each licensee is addressed as “you”. “Licensees” and +“recipients” may be individuals or organizations. + +To “modify” a work means to copy from or adapt all or part of the work in +a fashion requiring copyright permission, other than the making of an exact copy. The +resulting work is called a “modified version” of the earlier work or a +work “based on” the earlier work. + +A “covered work” means either the unmodified Program or a work based on +the Program. 
+ +To “propagate” a work means to do anything with it that, without +permission, would make you directly or secondarily liable for infringement under +applicable copyright law, except executing it on a computer or modifying a private +copy. Propagation includes copying, distribution (with or without modification), +making available to the public, and in some countries other activities as well. + +To “convey” a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through a computer +network, with no transfer of a copy, is not conveying. + +An interactive user interface displays “Appropriate Legal Notices” to the +extent that it includes a convenient and prominently visible feature that (1) +displays an appropriate copyright notice, and (2) tells the user that there is no +warranty for the work (except to the extent that warranties are provided), that +licensees may convey the work under this License, and how to view a copy of this +License. If the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + +### 1. Source Code. + +The “source code” for a work means the preferred form of the work for +making modifications to it. “Object code” means any non-source form of a +work. + +A “Standard Interface” means an interface that either is an official +standard defined by a recognized standards body, or, in the case of interfaces +specified for a particular programming language, one that is widely used among +developers working in that language. 
+ +The “System Libraries” of an executable work include anything, other than +the work as a whole, that (a) is included in the normal form of packaging a Major +Component, but which is not part of that Major Component, and (b) serves only to +enable use of the work with that Major Component, or to implement a Standard +Interface for which an implementation is available to the public in source code form. +A “Major Component”, in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system (if any) on which +the executable work runs, or a compiler used to produce the work, or an object code +interpreter used to run it. + +The “Corresponding Source” for a work in object code form means all the +source code needed to generate, install, and (for an executable work) run the object +code and to modify the work, including scripts to control those activities. However, +it does not include the work's System Libraries, or general-purpose tools or +generally available free programs which are used unmodified in performing those +activities but which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for the work, and +the source code for shared libraries and dynamically linked subprograms that the work +is specifically designed to require, such as by intimate data communication or +control flow between those subprograms and other parts of the work. + +The Corresponding Source need not include anything that users can regenerate +automatically from other parts of the Corresponding Source. + +The Corresponding Source for a work in source code form is that same work. + +### 2. Basic Permissions. + +All rights granted under this License are granted for the term of copyright on the +Program, and are irrevocable provided the stated conditions are met. This License +explicitly affirms your unlimited permission to run the unmodified Program. 
The +output from running a covered work is covered by this License only if the output, +given its content, constitutes a covered work. This License acknowledges your rights +of fair use or other equivalent, as provided by copyright law. + +You may make, run and propagate covered works that you do not convey, without +conditions so long as your license otherwise remains in force. You may convey covered +works to others for the sole purpose of having them make modifications exclusively +for you, or provide you with facilities for running those works, provided that you +comply with the terms of this License in conveying all material for which you do not +control copyright. Those thus making or running the covered works for you must do so +exclusively on your behalf, under your direction and control, on terms that prohibit +them from making any copies of your copyrighted material outside their relationship +with you. + +Conveying under any other circumstances is permitted solely under the conditions +stated below. Sublicensing is not allowed; section 10 makes it unnecessary. + +### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + +No covered work shall be deemed part of an effective technological measure under any +applicable law fulfilling obligations under article 11 of the WIPO copyright treaty +adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention +of such measures. + +When you convey a covered work, you waive any legal power to forbid circumvention of +technological measures to the extent such circumvention is effected by exercising +rights under this License with respect to the covered work, and you disclaim any +intention to limit operation or modification of the work as a means of enforcing, +against the work's users, your or third parties' legal rights to forbid circumvention +of technological measures. + +### 4. Conveying Verbatim Copies. 
+ +You may convey verbatim copies of the Program's source code as you receive it, in any +medium, provided that you conspicuously and appropriately publish on each copy an +appropriate copyright notice; keep intact all notices stating that this License and +any non-permissive terms added in accord with section 7 apply to the code; keep +intact all notices of the absence of any warranty; and give all recipients a copy of +this License along with the Program. + +You may charge any price or no price for each copy that you convey, and you may offer +support or warranty protection for a fee. + +### 5. Conveying Modified Source Versions. + +You may convey a work based on the Program, or the modifications to produce it from +the Program, in the form of source code under the terms of section 4, provided that +you also meet all of these conditions: + +* **a)** The work must carry prominent notices stating that you modified it, and giving a +relevant date. +* **b)** The work must carry prominent notices stating that it is released under this +License and any conditions added under section 7. This requirement modifies the +requirement in section 4 to “keep intact all notices”. +* **c)** You must license the entire work, as a whole, under this License to anyone who +comes into possession of a copy. This License will therefore apply, along with any +applicable section 7 additional terms, to the whole of the work, and all its parts, +regardless of how they are packaged. This License gives no permission to license the +work in any other way, but it does not invalidate such permission if you have +separately received it. +* **d)** If the work has interactive user interfaces, each must display Appropriate Legal +Notices; however, if the Program has interactive interfaces that do not display +Appropriate Legal Notices, your work need not make them do so. 
+ +A compilation of a covered work with other separate and independent works, which are +not by their nature extensions of the covered work, and which are not combined with +it such as to form a larger program, in or on a volume of a storage or distribution +medium, is called an “aggregate” if the compilation and its resulting +copyright are not used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work in an aggregate +does not cause this License to apply to the other parts of the aggregate. + +### 6. Conveying Non-Source Forms. + +You may convey a covered work in object code form under the terms of sections 4 and +5, provided that you also convey the machine-readable Corresponding Source under the +terms of this License, in one of these ways: + +* **a)** Convey the object code in, or embodied in, a physical product (including a +physical distribution medium), accompanied by the Corresponding Source fixed on a +durable physical medium customarily used for software interchange. +* **b)** Convey the object code in, or embodied in, a physical product (including a +physical distribution medium), accompanied by a written offer, valid for at least +three years and valid for as long as you offer spare parts or customer support for +that product model, to give anyone who possesses the object code either (1) a copy of +the Corresponding Source for all the software in the product that is covered by this +License, on a durable physical medium customarily used for software interchange, for +a price no more than your reasonable cost of physically performing this conveying of +source, or (2) access to copy the Corresponding Source from a network server at no +charge. +* **c)** Convey individual copies of the object code with a copy of the written offer to +provide the Corresponding Source. 
This alternative is allowed only occasionally and +noncommercially, and only if you received the object code with such an offer, in +accord with subsection 6b. +* **d)** Convey the object code by offering access from a designated place (gratis or for +a charge), and offer equivalent access to the Corresponding Source in the same way +through the same place at no further charge. You need not require recipients to copy +the Corresponding Source along with the object code. If the place to copy the object +code is a network server, the Corresponding Source may be on a different server +(operated by you or a third party) that supports equivalent copying facilities, +provided you maintain clear directions next to the object code saying where to find +the Corresponding Source. Regardless of what server hosts the Corresponding Source, +you remain obligated to ensure that it is available for as long as needed to satisfy +these requirements. +* **e)** Convey the object code using peer-to-peer transmission, provided you inform +other peers where the object code and Corresponding Source of the work are being +offered to the general public at no charge under subsection 6d. + +A separable portion of the object code, whose source code is excluded from the +Corresponding Source as a System Library, need not be included in conveying the +object code work. + +A “User Product” is either (1) a “consumer product”, which +means any tangible personal property which is normally used for personal, family, or +household purposes, or (2) anything designed or sold for incorporation into a +dwelling. In determining whether a product is a consumer product, doubtful cases +shall be resolved in favor of coverage. For a particular product received by a +particular user, “normally used” refers to a typical or common use of +that class of product, regardless of the status of the particular user or of the way +in which the particular user actually uses, or expects or is expected to use, the +product. 
A product is a consumer product regardless of whether the product has +substantial commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + +“Installation Information” for a User Product means any methods, +procedures, authorization keys, or other information required to install and execute +modified versions of a covered work in that User Product from a modified version of +its Corresponding Source. The information must suffice to ensure that the continued +functioning of the modified object code is in no case prevented or interfered with +solely because modification has been made. + +If you convey an object code work under this section in, or with, or specifically for +use in, a User Product, and the conveying occurs as part of a transaction in which +the right of possession and use of the User Product is transferred to the recipient +in perpetuity or for a fixed term (regardless of how the transaction is +characterized), the Corresponding Source conveyed under this section must be +accompanied by the Installation Information. But this requirement does not apply if +neither you nor any third party retains the ability to install modified object code +on the User Product (for example, the work has been installed in ROM). + +The requirement to provide Installation Information does not include a requirement to +continue to provide support service, warranty, or updates for a work that has been +modified or installed by the recipient, or for the User Product in which it has been +modified or installed. Access to a network may be denied when the modification itself +materially and adversely affects the operation of the network or violates the rules +and protocols for communication across the network. 
+ +Corresponding Source conveyed, and Installation Information provided, in accord with +this section must be in a format that is publicly documented (and with an +implementation available to the public in source code form), and must require no +special password or key for unpacking, reading or copying. + +### 7. Additional Terms. + +“Additional permissions” are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. Additional +permissions that are applicable to the entire Program shall be treated as though they +were included in this License, to the extent that they are valid under applicable +law. If additional permissions apply only to part of the Program, that part may be +used separately under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + +When you convey a copy of a covered work, you may at your option remove any +additional permissions from that copy, or from any part of it. (Additional +permissions may be written to require their own removal in certain cases when you +modify the work.) You may place additional permissions on material, added by you to a +covered work, for which you have or can give appropriate copyright permission. 
+ +Notwithstanding any other provision of this License, for material you add to a +covered work, you may (if authorized by the copyright holders of that material) +supplement the terms of this License with terms: + +* **a)** Disclaiming warranty or limiting liability differently from the terms of +sections 15 and 16 of this License; or +* **b)** Requiring preservation of specified reasonable legal notices or author +attributions in that material or in the Appropriate Legal Notices displayed by works +containing it; or +* **c)** Prohibiting misrepresentation of the origin of that material, or requiring that +modified versions of such material be marked in reasonable ways as different from the +original version; or +* **d)** Limiting the use for publicity purposes of names of licensors or authors of the +material; or +* **e)** Declining to grant rights under trademark law for use of some trade names, +trademarks, or service marks; or +* **f)** Requiring indemnification of licensors and authors of that material by anyone +who conveys the material (or modified versions of it) with contractual assumptions of +liability to the recipient, for any liability that these contractual assumptions +directly impose on those licensors and authors. + +All other non-permissive additional terms are considered “further +restrictions” within the meaning of section 10. If the Program as you received +it, or any part of it, contains a notice stating that it is governed by this License +along with a term that is a further restriction, you may remove that term. If a +license document contains a further restriction but permits relicensing or conveying +under this License, you may add to a covered work material governed by the terms of +that license document, provided that the further restriction does not survive such +relicensing or conveying. 
+ +If you add terms to a covered work in accord with this section, you must place, in +the relevant source files, a statement of the additional terms that apply to those +files, or a notice indicating where to find the applicable terms. + +Additional terms, permissive or non-permissive, may be stated in the form of a +separately written license, or stated as exceptions; the above requirements apply +either way. + +### 8. Termination. + +You may not propagate or modify a covered work except as expressly provided under +this License. Any attempt otherwise to propagate or modify it is void, and will +automatically terminate your rights under this License (including any patent licenses +granted under the third paragraph of section 11). + +However, if you cease all violation of this License, then your license from a +particular copyright holder is reinstated (a) provisionally, unless and until the +copyright holder explicitly and finally terminates your license, and (b) permanently, +if the copyright holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + +Moreover, your license from a particular copyright holder is reinstated permanently +if the copyright holder notifies you of the violation by some reasonable means, this +is the first time you have received notice of violation of this License (for any +work) from that copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + +Termination of your rights under this section does not terminate the licenses of +parties who have received copies or rights from you under this License. If your +rights have been terminated and not permanently reinstated, you do not qualify to +receive new licenses for the same material under section 10. + +### 9. Acceptance Not Required for Having Copies. + +You are not required to accept this License in order to receive or run a copy of the +Program. 
Ancillary propagation of a covered work occurring solely as a consequence of +using peer-to-peer transmission to receive a copy likewise does not require +acceptance. However, nothing other than this License grants you permission to +propagate or modify any covered work. These actions infringe copyright if you do not +accept this License. Therefore, by modifying or propagating a covered work, you +indicate your acceptance of this License to do so. + +### 10. Automatic Licensing of Downstream Recipients. + +Each time you convey a covered work, the recipient automatically receives a license +from the original licensors, to run, modify and propagate that work, subject to this +License. You are not responsible for enforcing compliance by third parties with this +License. + +An “entity transaction” is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an organization, or +merging organizations. If propagation of a covered work results from an entity +transaction, each party to that transaction who receives a copy of the work also +receives whatever licenses to the work the party's predecessor in interest had or +could give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if the predecessor +has it or can get it with reasonable efforts. + +You may not impose any further restrictions on the exercise of the rights granted or +affirmed under this License. For example, you may not impose a license fee, royalty, +or other charge for exercise of rights granted under this License, and you may not +initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging +that any patent claim is infringed by making, using, selling, offering for sale, or +importing the Program or any portion of it. + +### 11. Patents. 
+ +A “contributor” is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The work thus +licensed is called the contributor's “contributor version”. + +A contributor's “essential patent claims” are all patent claims owned or +controlled by the contributor, whether already acquired or hereafter acquired, that +would be infringed by some manner, permitted by this License, of making, using, or +selling its contributor version, but do not include claims that would be infringed +only as a consequence of further modification of the contributor version. For +purposes of this definition, “control” includes the right to grant patent +sublicenses in a manner consistent with the requirements of this License. + +Each contributor grants you a non-exclusive, worldwide, royalty-free patent license +under the contributor's essential patent claims, to make, use, sell, offer for sale, +import and otherwise run, modify and propagate the contents of its contributor +version. + +In the following three paragraphs, a “patent license” is any express +agreement or commitment, however denominated, not to enforce a patent (such as an +express permission to practice a patent or covenant not to sue for patent +infringement). To “grant” such a patent license to a party means to make +such an agreement or commitment not to enforce a patent against the party. 
+ +If you convey a covered work, knowingly relying on a patent license, and the +Corresponding Source of the work is not available for anyone to copy, free of charge +and under the terms of this License, through a publicly available network server or +other readily accessible means, then you must either (1) cause the Corresponding +Source to be so available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner consistent with +the requirements of this License, to extend the patent license to downstream +recipients. “Knowingly relying” means you have actual knowledge that, but +for the patent license, your conveying the covered work in a country, or your +recipient's use of the covered work in a country, would infringe one or more +identifiable patents in that country that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or arrangement, you +convey, or propagate by procuring conveyance of, a covered work, and grant a patent +license to some of the parties receiving the covered work authorizing them to use, +propagate, modify or convey a specific copy of the covered work, then the patent +license you grant is automatically extended to all recipients of the covered work and +works based on it. + +A patent license is “discriminatory” if it does not include within the +scope of its coverage, prohibits the exercise of, or is conditioned on the +non-exercise of one or more of the rights that are specifically granted under this +License. 
You may not convey a covered work if you are a party to an arrangement with +a third party that is in the business of distributing software, under which you make +payment to the third party based on the extent of your activity of conveying the +work, and under which the third party grants, to any of the parties who would receive +the covered work from you, a discriminatory patent license (a) in connection with +copies of the covered work conveyed by you (or copies made from those copies), or (b) +primarily for and in connection with specific products or compilations that contain +the covered work, unless you entered into that arrangement, or that patent license +was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting any implied +license or other defenses to infringement that may otherwise be available to you +under applicable patent law. + +### 12. No Surrender of Others' Freedom. + +If conditions are imposed on you (whether by court order, agreement or otherwise) +that contradict the conditions of this License, they do not excuse you from the +conditions of this License. If you cannot convey a covered work so as to satisfy +simultaneously your obligations under this License and any other pertinent +obligations, then as a consequence you may not convey it at all. For example, if you +agree to terms that obligate you to collect a royalty for further conveying from +those to whom you convey the Program, the only way you could satisfy both those terms +and this License would be to refrain entirely from conveying the Program. + +### 13. Use with the GNU Affero General Public License. + +Notwithstanding any other provision of this License, you have permission to link or +combine any covered work with a work licensed under version 3 of the GNU Affero +General Public License into a single combined work, and to convey the resulting work. 
+The terms of this License will continue to apply to the part which is the covered +work, but the special requirements of the GNU Affero General Public License, section +13, concerning interaction through a network will apply to the combination as such. + +### 14. Revised Versions of this License. + +The Free Software Foundation may publish revised and/or new versions of the GNU +General Public License from time to time. Such new versions will be similar in spirit +to the present version, but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Program specifies that +a certain numbered version of the GNU General Public License “or any later +version” applies to it, you have the option of following the terms and +conditions either of that numbered version or of any later version published by the +Free Software Foundation. If the Program does not specify a version number of the GNU +General Public License, you may choose any version ever published by the Free +Software Foundation. + +If the Program specifies that a proxy can decide which future versions of the GNU +General Public License can be used, that proxy's public statement of acceptance of a +version permanently authorizes you to choose that version for the Program. + +Later license versions may give you additional or different permissions. However, no +additional obligations are imposed on any author or copyright holder as a result of +your choosing to follow a later version. + +### 15. Disclaimer of Warranty. + +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER +EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
THE ENTIRE RISK AS TO THE +QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +### 16. Limitation of Liability. + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY +COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS +PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, +INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE +OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE +WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +### 17. Interpretation of Sections 15 and 16. + +If the disclaimer of warranty and limitation of liability provided above cannot be +given local legal effect according to their terms, reviewing courts shall apply local +law that most closely approximates an absolute waiver of all civil liability in +connection with the Program, unless a warranty or assumption of liability accompanies +a copy of the Program in return for a fee. \ No newline at end of file diff --git a/README.md b/README.md index c8f3d64..69111be 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,82 @@ # raven-python -The Python counterpart of the -[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB), built on -[cobrapy](https://github.com/opencobra/cobrapy). - -`raven-python` covers de-novo reconstruction (KEGG + protein homology), -context-specific model extraction (`tINIT` / `ftINIT`), metabolic-task -validation, gap-filling, omics ingestion, sub-cellular localisation, model -manipulation, and YAML / SIF / Excel I/O — preserving the established RAVEN -workflows in a Python-native form. - -This `main` branch is intentionally empty. 
Development happens on the -`develop` branch via a series of feature branches; see the open and merged -pull requests for the current state of the port. +[![CI](https://github.com/SysBioChalmers/raven-python/actions/workflows/ci.yml/badge.svg)](https://github.com/SysBioChalmers/raven-python/actions/workflows/ci.yml) + +**Reconstruction, Analysis and Visualisation of Metabolic Networks — in Python.** + +`raven-python` is the Python counterpart of the +[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB). It builds on +[**cobrapy**](https://github.com/opencobra/cobrapy) for everything cobrapy already does +well (simulation, standard analyses, SBML I/O, model manipulation) and adds the +functionality that's unique to RAVEN: + +* **De novo reconstruction** from KEGG and protein homology (BLAST / DIAMOND). +* **Context-specific models** from omics data via **tINIT / ftINIT**, with task-aware + gap-filling and the linear-merge MILP reduction. +* **Metabolic-task** validation (`check_tasks`, `fitTasks`). +* **Connectivity gap-filling** against template models. +* **Omics integration** — Human Protein Atlas (proteomics + RNA-seq) ingestion. +* **Sub-cellular localisation** prediction by MILP, with partial-update mode and + pluggable predictors (WoLF PSORT, DeepLoc, …). +* **N-model comparison**; **reporter metabolites**; **FSEOF**; **flux sampling**. +* **YAML I/O** following the cobra standard, plus geckopy's `ec-*` enzyme-constrained + fields. **SIF** export. **RAVEN-style Excel** export. + +The status of every RAVEN function (ported, cheatsheet-mapped to cobra, or explicitly +not ported) is documented function-by-function in +**[docs/raven_migration.md](docs/raven_migration.md)**. + +## Design principle + +The canonical in-memory object is always a [`cobra.Model`](https://cobrapy.readthedocs.io). +There is no parallel RAVEN struct, no `ravenCobraWrapper`-style adapter. 
RAVEN-specific +fields that cobra doesn't model natively (`rxnMiriams`, `metDeltaG`, +`rxnConfidenceScores`, …) live in cobra's `annotation` / `notes` dictionaries. This +avoids duplicating cobra's data model and keeps raven-python interoperable with the wider +COBRA ecosystem. + +## Status + +raven-python has been validated against MATLAB RAVEN on **Human-GEM** (5 Hart2015 cell-line +models, Jaccard 0.978–0.980 for the task-constrained runs — see [docs/humangem_validation.md](docs/humangem_validation.md)). +The functional scope of the original RAVEN toolbox is covered with two principled +omissions: + +* **MetaCyc-based reconstruction** is not implemented and is flagged for removal from + MATLAB RAVEN as well — see [IMPROVEMENTS.md](IMPROVEMENTS.md) under `R-MetaCyc`. +* **Dynamic FBA** is not implemented — several maintained Python packages already cover + it ([`dfba`](https://pypi.org/project/dfba/), [`reframed`](https://pypi.org/project/reframed/), + [`mewpy`](https://pypi.org/project/mewpy/)). + +What's still open is catalogued in **[docs/todo.md](docs/todo.md)** (visualisation / Phase +6 is the main item). + +## Installation (development) + +```bash +git clone https://github.com/SysBioChalmers/raven-python +cd raven-python +pip install -e ".[dev]" +``` + +raven-python requires Python ≥ 3.11. Genome-scale (f)tINIT MILPs currently require **Gurobi** +([details on solver portability](docs/init_solver_benchmark.md)); toy and unit-test work +runs on the open-source GLPK. + +## Documentation + +See **[docs/README.md](docs/README.md)** for the documentation index. + +## Relationship to MATLAB RAVEN + +`raven-python` is a derivative work and is released under the same **GPL-3.0-or-later** +license. If you use it in scientific work, please cite the RAVEN 2 paper: + +> Wang H, Marcišauskas S, Sánchez BJ, Domenzain I, Hermansson D, Agren R, Nielsen J, +> Kerkhoven EJ. (2018) RAVEN 2.0: A versatile toolbox for metabolic network +> reconstruction and a case study on *Streptomyces coelicolor*.
PLoS Comput Biol 14(10): +> e1006541. + +## License + +[GPL-3.0-or-later](LICENSE) diff --git a/docs/humangem_validation.md b/docs/humangem_validation.md new file mode 100644 index 0000000..bbaae6b --- /dev/null +++ b/docs/humangem_validation.md @@ -0,0 +1,117 @@ +# Human-GEM cell-type model validation: raven-python vs RAVEN + +Validation of raven-python's tINIT/ftINIT against MATLAB RAVEN on a real genome-scale +reconstruction (Human-GEM) using the Hart2015 RNA-seq dataset (5 cell lines: DLD1, +GBM, HCT116, HELA, RPE1). The goal is functional equivalence — do raven-python and RAVEN +extract the *same* context-specific reaction sets from the same inputs? + +## Method + +* **Template & inputs.** RAVEN built the ftINIT reference model from Human-GEM + (`prepHumanModelForftINIT`: remove drug/exchange/artificial reactions, set + spontaneous/custom lists) and exported it as `raven_refModel.xml` (10198 reactions). + raven-python builds on that *same* exported model, so the candidate reaction universe is + identical and set comparison is exact. +* **Scoring.** Gene scores from `log2(TPM+1)`-style expression via + `gene_scores_from_expression`, mapped to reactions through the GPR + (`score_reactions_from_genes`), matching RAVEN's `getExprForRxnScore`. +* **ftINIT.** Series `1+1` (2 staged MILP steps). RAVEN run via `ftINIT.m` with Gurobi; + raven-python via `raven_python.init.ftinit` with Gurobi (`mip_gap=0.001`, `time_limit=600`). +* **tINIT.** raven-python `get_init_model` (classic single-MILP INIT) on HCT116, compared to + the ftINIT result for the same cell line. +* **Tasks.** Two raven-python ftINIT variants: *no-task* (expression only) and + *task-constrained* (essential metabolic tasks, `metabolicTasks_Essential.txt`, force + task-essential reactions to be kept). RAVEN's reference is task-constrained. +* **Solver.** Gurobi 13.0.1 for both tools. 
+ +## Engineering findings (raven-python tractability) + +Getting ftINIT to run at genome scale surfaced three issues, all now fixed and matching +RAVEN's design: + +1. **O(n²) constraint construction.** Building the steady-state balances with Python + `sum()` re-canonicalises a growing sympy expression at each term; hub metabolites + (ATP/H⁺/H₂O in ~10³ reactions) made one constraint take ~minutes (≈154 s total build, + benchmark: 1500-term `sum` = 59 s vs `optlang.symbolics.add` = 0.01 s). Fixed by + building flat term lists once per reaction and summing with `optlang.symbolics.add` + (in both ftINIT and tINIT). +2. **Big-M too loose.** The on/off indicator constraints used each reaction's own bound + (±1000) as big-M; with `force_on=0.1` that is a ~10⁴ ratio → very weak LP relaxation + → Gurobi never closes the gap. RAVEN uses a fixed big-M = 100. Adopted. +3. **Stoichiometric rescaling.** A fixed big-M=100 is only valid if no reaction needs + flux ≫100; ported RAVEN's `rescaleModelForINIT` (cap each reaction's coefficient + dynamic range at 25×, normalise mean |coeff| to 1) into `prep_init_model`. Without it + the staged MILP is infeasible (step-1 caps transports that step-0 used freely). + +Net effect: a full ftINIT cell-line solve went from *not finishing* to ~200 s, +comparable to RAVEN. + +## Results + +### Reaction counts + +| cell line | RAVEN ftINIT | raven-python ftINIT (no-task) | raven-python ftINIT (task) | +|-----------|-------------:|--------------------------:|-----------------------:| +| DLD1 | 7782 | 7744 | 7774 | +| GBM | 7668 | 7667 | 7680 | +| HCT116 | 7780 | 7752 | 7776 | +| HELA | 7832 | 7789 | 7816 | +| RPE1 | 7569 | 7564 | 7570 | + +Counts agree within ~0.5 % everywhere; the task-constrained run is closest (e.g. RPE1 +7570 vs 7569, HCT116 7776 vs 7780). raven-python tINIT (HCT116) gives 6024 reactions — a +smaller model, as expected from the different (classic INIT) objective. 
+ +### Agreement — raven-python (no-task) ftINIT vs RAVEN ftINIT + +| cell line | shared | only raven-python | only RAVEN | Jaccard | +|-----------|-------:|--------------:|-----------:|--------:| +| DLD1 | 7667 | 77 | 115 | 0.976 | +| GBM | 7562 | 105 | 106 | 0.973 | +| HCT116 | 7675 | 77 | 105 | 0.977 | +| HELA | 7707 | 82 | 125 | 0.974 | +| RPE1 | 7470 | 94 | 99 | 0.975 | + +**~97.5 % of reactions are identical** between the two independent implementations, even +though this run is *expression-only* while RAVEN's reference is task-constrained. The +"only RAVEN" surplus (≈99–125) is expected to include task-essential reactions that the +task-constrained run (below) recovers. + +### Agreement — raven-python (task-constrained) ftINIT vs RAVEN ftINIT + +| cell line | shared | only raven-python | only RAVEN | Jaccard | +|-----------|-------:|--------------:|-----------:|--------:| +| DLD1 | 7699 | 75 | 83 | 0.980 | +| GBM | 7588 | 92 | 80 | 0.978 | +| HCT116 | 7696 | 80 | 84 | 0.979 | +| HELA | 7735 | 81 | 97 | 0.978 | +| RPE1 | 7493 | 77 | 76 | 0.980 | + +Adding the essential metabolic tasks (same task list RAVEN uses) raises agreement to +**Jaccard 0.978–0.980** and makes the disagreement symmetric (only-raven-python ≈ only-RAVEN +≈ 80), confirming the prediction: the task constraints recover RAVEN's task-essential +reactions. The residual ≈80 reactions each way out of ~7700 is at the level expected from +MIP-gap tolerance (both accept near-optimal incumbents) and alternate optima. + +### raven-python tINIT vs ftINIT (HCT116) + +tINIT 6024 rxns vs ftINIT 7752; shared 5957, Jaccard 0.762. tINIT is nearly a subset +(only 67 reactions unique to it) — the two methods agree on a common core, with ftINIT +keeping more (its staged formulation and task handling are less aggressive at removal). +This matches the expected tINIT/ftINIT relationship rather than indicating a defect. 
+ +## Conclusions + +From identical inputs on a genome-scale human reconstruction, raven-python reproduces RAVEN's +ftINIT reaction selection to **97.5 % (no-task) and 98 % (task-constrained) set identity** +across five cell lines — strong evidence of functional equivalence between the two +independent implementations. Agreement is symmetric and the residual (~80 reactions each +way) is consistent with MIP near-optimality and alternate optima rather than any +systematic divergence. + +Reaching genome-scale tractability required matching RAVEN's numerical-conditioning +choices and fixing optlang-specific construction costs (see *Engineering findings*): +fixed big-M = 100, `rescaleModelForINIT`, `optlang.symbolics.add` instead of Python +`sum()` in every MILP build (ftINIT, tINIT, and the gap-fill). With these, a +task-constrained cell-line model builds in ~15–25 min (dominated by the +essential-forced staged MILP) and a no-task one in ~3 min, comparable to RAVEN. diff --git a/docs/init_param_calibration.md b/docs/init_param_calibration.md new file mode 100644 index 0000000..cc69314 --- /dev/null +++ b/docs/init_param_calibration.md @@ -0,0 +1,342 @@ +# (f)tINIT parameter calibration & input-robustness + +Empirical study of raven-python's (f)tINIT parameters on a genome-scale model (Human-GEM, +Hart2015 / HCT116). Two questions: + +1. **Calibration** — on clean data, which parameter values give the best speed/quality + trade-off? (`scripts/analyze_init_params.py`) +2. **Robustness** — with the task layer always on (it is part of the pipeline, not a + variable), how does degrading the *transcriptomics input* affect the model, and which + parameters keep it functional and stable? (`scripts/analyze_init_robustness.py`) + +Both scripts are resumable and reusable on any model/dataset; the numbers below are HCT116. 
+"Jaccard" is reaction-set overlap with the reference (tightest setting / clean data) — for +a model-extraction tool the reaction set is the product, and a MIP gap bounds only the +*objective*, so set-stability is tracked separately. + +--- + +## 1. Clean-data calibration + +### ftINIT MILP — `mip_gap` (single step-0 solve, big_m=100, force_on=0.1) + +| mip_gap | time (s) | objective | rel.obj.gap | Jaccard vs tightest | +|--------:|---------:|----------:|------------:|--------------------:| +| 0.0002 | 48 | 49357 | ref | ref | +| 0.001 | 44 | 49357 | +0.0000 | **1.0000** | +| 0.003 | 42 | 49289 | +0.0014 | 0.9973 | +| 0.01 | 42 | 49185 | +0.0035 | 0.9935 | +| 0.03 | 52 | 49185 | +0.0035 | 0.9935 | +| 0.1 | 46 | 45615 | +0.0758 | 0.9469 | + +**Solve time is essentially flat across the gap** (the model build dominates), so a tight +gap is nearly free. `mip_gap=0.001` reproduces the proven optimum exactly (Jaccard 1.0); +quality only collapses at 0.1. → **Default 0.001.** (The genome-scale staged pipeline still +needs *some* gap + a `time_limit` because the full essential-forced MILP can be much harder +than this single step — see robustness timings.) + +### ftINIT MILP — `big_m` (gap=0.001, force_on=0.1) + +| big_m | time (s) | rel.obj.gap | Jaccard vs big_m=100 | +|------:|---------:|------------:|---------------------:| +| 100 | 51 | ref | ref | +| 50 | 54 | +0.0006 | 0.983 | +| 25 | 53 | +0.0007 | 0.982 | +| 250 | 55 | +0.0005 | 0.984 | +| 1000 | 59 | +0.0001 | 0.986 | + +At step-0 (on the *scaled* model) `big_m` barely affects objective or time, but shifts which +reactions are kept by ~2% (alternate optima). `big_m=100` is RAVEN's value and is required +for the *staged* pipeline to stay feasible (a fixed 100 is only valid with stoichiometric +rescaling — see §1.4). 
→ **Default 100.** + +### ftINIT MILP — `force_on` (gap=0.001, big_m=100) + +| force_on | time (s) | rel.obj.gap | Jaccard vs 0.1 | +|---------:|---------:|------------:|---------------:| +| 0.1 | 63 | ref | ref | +| 0.02 | 69 | +0.0005 | 0.983 | +| 0.05 | 56 | +0.0000 | 0.990 | +| 0.2 | 59 | +0.0004 | 0.982 | +| 0.5 | 79 | +0.0005 | 0.985 | + +`force_on` (minimum flux for a reaction to count as "on") changes the *model*, not just a +tolerance, but the reaction set is fairly insensitive (Jaccard ≥0.98) and the objective +hardly moves. → **Default 0.1** (RAVEN), no strong reason to change. + +### prep scaling — `rescaleModelForINIT` `max_stoich_diff` and on/off (gap=0.001, big_m=100) + +| config | time (s) | rel.obj.gap | Jaccard vs scaled msd=25 | +|--------|---------:|------------:|-------------------------:| +| scale on, msd=25 | 51 | ref | ref | +| msd=10 | 49 | +0.0075 | 0.989 | +| msd=50 | 61 | +0.0003 | 0.982 | +| msd=100 | 62 | −0.0001 | 0.986 | +| scale off | 45 | +0.0129 | 0.973 | + +At step-0 even `scale=off` is feasible, but it drifts most (Jaccard 0.973, objective +1.3%); +`max_stoich_diff` 10–100 are all within ~1%. **This understates scaling's importance** — at +step-0 there is no big-M cap on the held-out transports. In the *full staged pipeline*, +`scale=off` with `big_m=100` is **infeasible** (step-1 caps transports that step-0 used +freely). → **Keep scaling on, msd=25** (RAVEN's default). + +**Calibration summary (defaults are well-chosen):** `mip_gap=0.001`, `big_m=100`, +`force_on=0.1`, scaling on (`max_stoich_diff=25`). For the genome-scale staged pipeline also +set a `time_limit` (≈120–600 s/step) so a hard essential-forced step returns a near-optimal +incumbent rather than grinding. 
+ +### tINIT MILP (`get_init_model`, `essential_rxns=[]`, `time_limit=400s`) + +**mip_gap** (eps=1, prod_weight=0.5): + +| mip_gap | time (s) | n_kept | Jaccard vs gap=0.001 | +|--------:|---------:|-------:|---------------------:| +| 0.001 | 901 | 6024 | ref | +| 0.003 | 869 | 6036 | 0.991 | +| 0.01 | 595 | 5967 | 0.968 | + +Tightening the gap costs ~50% more wall time on this MILP (unlike ftINIT step-0, build +doesn't dominate); a 1% gap is ~30% faster with ~3% reaction-set drift. +→ **`mip_gap=0.001`** for stability, **0.01** for a faster looser solve. + +**eps** (gap=0.005, the connectivity-flux threshold — *changes the model*): + +| eps | n_kept | Jaccard vs eps=1.0 | +|----:|-------:|-------------------:| +| 0.1 | 6119 | 0.952 | +| 0.5 | 6123 | 0.952 | +| 1.0 | 6064 | ref | +| 2.0 | 6090 | 0.960 | + +Each `eps` value gives a slightly different model (Jaccard ~0.95 across the range); the +reaction-set spread is ~5%. `eps=1.0` is RAVEN's default; smaller values produce *slightly* +larger models (loosen the connectivity bar). Pick by what the data justifies — see the +caveat at the top of `init.py`. + +**prod_weight** (gap=0.005, the metabolite-production reward — *changes the model*): + +| prod_weight | n_kept | Jaccard vs 0.5 | +|------------:|-------:|---------------:| +| 0.0 | 5973 | 0.961 | +| 0.25 | 6015 | 0.974 | +| 0.5 | 6064 | ref | +| 1.0 | 6106 | 0.955 | + +A higher `prod_weight` keeps slightly more reactions (rewards more connectivity); `0.5` +(RAVEN's default) is the middle of the range. Effect is modest (~5%). + +**big_m** (gap=0.005, `None` = per-reaction `ub`): + +| big_m | n_kept | Jaccard vs None | +|------:|-------:|----------------:| +| None (per-rxn ub) | 6064 | ref | +| 1000 | 6064 | **1.000** | +| 250 | 6114 | 0.953 | +| 100 | 6023 | 0.930 | + +`big_m=1000` is identical to `big_m=None` here because the model's `ub` is ±1000 already +(so the per-reaction cap *is* 1000). 
Smaller fixed caps (250, 100) shift alternate optima +by 5–7% but do not change the objective. Unlike ftINIT, tINIT has *not* been run through +`rescaleModelForINIT`, so dropping `big_m` below 1000 may invalidate the LP feasibility +region for reactions whose own bound is larger — keep the default (per-reaction `ub`). + +**tINIT calibration summary:** `mip_gap=0.001` (or 0.01 for ~30% speedup at ~3% drift); +`eps`, `prod_weight`, `big_m` defaults are fine — they all change the *model*, not just +tolerance, so tune by what the data and biology call for, not by these tables. + +### ftINIT full pipeline (`ftinit`, series='1+1', no-task scaled prep, `time_limit=600s`) + +| config | time (s) | n_kept | Jaccard vs gap=0.001 | +|--------|---------:|-------:|---------------------:| +| **mip_gap=0.001** (default big_m=100) | 346 | 7752 | ref | +| mip_gap=0.003 | 288 | 7748 | 0.993 | +| mip_gap=0.01 | 218 | 7746 | **0.995** | +| big_m=50 (gap=0.003) | 738 | 7799 | 0.974 | +| big_m=250 (gap=0.003) | 345 | 7766 | 0.977 | + +Unlike the single-step ftINIT MILP in §1.1 (where build time dominated and the gap was +free), **the full pipeline does benefit from a looser gap**: `mip_gap=0.01` is ~37 % +faster than `0.001` with Jaccard 0.995 — essentially the same model. → **For genome-scale +ftINIT, `mip_gap=0.01` (or 0.005) is the sweet spot**; keep 0.001 only if exact +reproducibility matters more than a few minutes. + +`big_m=50` is actually *slower* than the default 100 (738s vs 346s) — a tighter cap makes +the LP relaxation harder for borderline reactions; `big_m=250` is the same speed as 100 +but shifts the reaction set ~2 %. → **Keep `big_m=100`** (RAVEN's value, what scaling is +designed for). + +### tINIT + many task-essential reactions: a structural limitation + +ftINIT's task layer (gap-fill) and tINIT's task layer (forcing `essential_rxns`) are +*not equivalent*. tINIT forces every essential reaction to carry `flux ≥ eps`. 
With +Human-GEM's 113 task-essential reactions (the validation set), the resulting steady-state +system is infeasible regardless of `eps`: + +| essentials passed to `run_init` | result | +|---|---| +| 0 (the original validation call) | ✅ ok, 6024 reactions | +| 113 (merged-survivor IDs from `prep.essential_rxns`) | ❌ `infeasible` (proven by Gurobi presolve, ~330s) | +| 260 (pre-merge IDs from `find_task_essential_reactions` cache) | ❌ `infeasible` (~480s) | + +Lowering `eps` (1.0 → 0.1) does **not** fix it; the issue is that 100+ reactions cannot +simultaneously each carry a fixed positive flux in their forced direction at steady state. +ftINIT avoids this by using an *adaptive* per-reaction forcing magnitude +(`min(0.99·|previous flux|, force_on)`) so each essential is forced at a value it +*actually carried* in a prior feasible solution. tINIT's one-size-fits-all `eps` +mechanism doesn't have that escape hatch. + +**Practical takeaway.** For functional context-specific models on genome-scale data, use +ftINIT — the task layer (gap-fill, adaptive essential forcing) is what makes the pipeline +robust. tINIT remains useful for the small/no-essentials case (e.g. the +expression-only baseline in the validation), but pairing it with the full task-essential +set is a known incompatibility; the tINIT robustness study below is therefore reported +with `essential_rxns=[]`. + +--- + +## 2. Robustness to degraded transcriptomics (task layer always on) + +The metabolic-task + gap-fill layer is held fixed; only the expression input is degraded. +`frac` = fraction of the 69 essential tasks the extracted model performs (`check_tasks`); +`Jaccard` = reaction-set overlap with the clean-data model. 
+ +| input | n_rxns | tasks pass | frac | Jaccard vs clean | +|-------|-------:|-----------:|-----:|-----------------:| +| **clean** | 7777 | 69/69 | 1.000 | ref | +| dropout 50% | 5968 | 67/69 | 0.971 | **0.713** | +| dropout 70% | 5113 | 68/69 | 0.986 | **0.594** | +| noise σ=1.0 | 7812 | 69/69 | 1.000 | 0.952 | +| noise σ=2.0 | 7768 | 69/69 | 1.000 | 0.919 | +| downsample 50% | 6765 | 68/69 | 0.986 | 0.815 | +| downsample 70% | 6123 | 68/69 | 0.986 | 0.728 | + +(dropout = genes set to 0 → score −5; noise = ×`exp(N(0,σ))`; downsample = genes dropped → +`no_gene_score`.) + +**Findings:** + +* **Robust to noise, sensitive to sparsity.** Multiplicative expression noise barely changes + the model (Jaccard 0.92–0.95, size stable, all tasks pass). Sparsity is far more damaging: + 50% dropout already drops the reaction set to **0.71 Jaccard** (and shrinks 7777→5968), 70% + to **0.59**. +* **Sparsity shrinks the model toward the task-essential core.** Missing/zeroed genes remove + the expression evidence for a reaction; the task layer only adds back what tasks require, so + sparse input yields smaller, more "generic" models. Dropout (−5) is harsher than + downsampling (−2). +* **Functionality is largely but not perfectly preserved.** With the task layer, `frac` stays + ≥0.97, but dips to 67–68/69 under heavy sparsity — i.e. the bounded gap-fill plus the + post-hoc low-score-gene pruning occasionally leave 1–2 essential tasks unsatisfied. (See the + lever sweep below for whether `no_gene_score`/`force_on` recover them.) +* **Cost tracks damage.** Dropout runs are slowest (more broken tasks → more gap-fill); + noise is cheap. + +> **Tractability note (a parameter that prevents failure):** the gap-fill MILP must be bounded +> (`mip_gap`/`time_limit`). Unbounded, severe degradation (which breaks many tasks at once) +> makes it solve a hard min-cost MILP per broken task to proven optimality — observed to run +> >75 min for one 90%-dropout model. 
With the bound it returns a near-optimal fill quickly. + +### Levers at dropout 70% — which parameter best stabilises the model? + +| config | n_rxns | frac | Jaccard vs clean | +|--------|-------:|-----:|-----------------:| +| default (no_gene_score=−2, force_on=0.1) | 5113 | 0.986 | 0.594 | +| no_gene_score=−1.0 | 5110 | 0.986 | 0.593 | +| no_gene_score=−0.5 | 5128 | 0.986 | 0.593 | +| force_on=0.2 | 5159 | 0.986 | 0.600 | + +**No lever recovers the drift** — Jaccard stays ~0.59 across all settings. Two reasons, +both informative: + +* The information dropout destroys is simply gone; no scoring/connectivity knob reconstructs + the missing expression evidence. You cannot tune your way out of sparse input. +* `no_gene_score` is the wrong knob *for dropout specifically*: dropout leaves genes + *present but zero* (scored −5), whereas `no_gene_score` only governs reactions whose genes + are **absent** from the data — i.e. the *downsampling* failure mode. So `no_gene_score` is + a meaningful lever for missing-data sparsity (a less-negative value keeps more + unmeasured reactions, growing the model back toward clean), but it has nothing to act on + under dropout. + +**Practical takeaway.** The robustness levers that matter are *structural*, not numeric: the +task + gap-fill layer (keeps the model functional regardless of input quality) and a bounded +gap-fill MILP (keeps it tractable). For *missing*-gene sparsity specifically, `no_gene_score` +trades model size against confidence. For noise, defaults are already robust. No parameter +restores fidelity lost to dropout — that is a property of the data, not the pipeline. + +### tINIT robustness — `essential_rxns=[]` (the tINIT-without-task-layer picture) + +For the reasons in §1.7 (*tINIT + many task-essential reactions*), tINIT cannot accept the full task-essential set as forced +reactions; this section runs `get_init_model` with `essential_rxns=[]` to show the +realistic tINIT behaviour on the same degradation gradient — i.e.
the *cost of not +having ftINIT's gap-fill safety net*. + +| input | n_rxns | tasks pass | frac | Jaccard vs clean | +|-------|-------:|-----------:|-----:|-----------------:| +| **clean** | 6277 | **35/69** | **0.507** | ref | +| dropout 50% | 4910 | 23/69 | 0.333 | 0.673 | +| dropout 70% | 2807 | 21/69 | 0.304 | 0.408 | +| noise σ=1.0 | 6661 | 25/69 | 0.362 | 0.878 | +| noise σ=2.0 | 6146 | 24/69 | 0.348 | 0.869 | +| downsample 50% | 5006 | 24/69 | 0.348 | 0.722 | +| downsample 70% | 3541 | 19/69 | 0.275 | 0.515 | + +**The headline contrast with ftINIT:** + +| | ftINIT (task layer) | tINIT (no task layer) | +|---|---|---| +| clean | 7777 rxns, **69/69 (1.000)** | 6277 rxns, **35/69 (0.507)** | +| dropout 0.7 | 5113 rxns, **68/69 (0.986)**, J 0.594 | 2807 rxns, **21/69 (0.304)**, J 0.408 | +| noise σ=2.0 | 7768 rxns, **69/69 (1.000)**, J 0.919 | 6146 rxns, **24/69 (0.348)**, J 0.869 | +| downsample 0.7 | 6123 rxns, **68/69 (0.986)**, J 0.728 | 3541 rxns, **19/69 (0.275)**, J 0.515 | + +* tINIT-without-gap-fill fails roughly **half the essential tasks even on clean data**; + ftINIT-with-gap-fill passes them all. Under degradation tINIT collapses further (down + to 19/69 at 70 % downsample), ftINIT stays ≥67/69 throughout. +* **Reaction-set drift is comparable** under noise (Jaccard 0.87 vs 0.92) but worse for + tINIT under sparsity (0.41 vs 0.59 at 70 % dropout) because there's no gap-fill to + re-add structurally needed reactions. + +This is *not* a critique of the tINIT algorithm — classic INIT was designed for the +no-task-layer case. It is the empirical evidence for why ftINIT's design choices (task ++ gap-fill, adaptive essential forcing) are the right ones for genome-scale tissue +model extraction, and why tINIT is mostly useful here as a baseline. 
+ +#### tINIT levers at dropout 70% + +| config | n_rxns | tasks pass | frac | Jaccard vs clean | +|--------|-------:|-----------:|-----:|-----------------:| +| default (prod_weight=0.5, eps=0.1) | 2807 | 21/69 | 0.304 | 0.408 | +| prod_weight=0.0 | 2791 | 21/69 | 0.304 | 0.416 | +| prod_weight=1.0 | 3386 | 22/69 | 0.319 | 0.485 | +| prod_weight=2.0 | 3888 | 21/69 | 0.304 | 0.458 | +| eps=0.5 | 2620 | 21/69 | 0.304 | 0.391 | +| eps=1.0 | 3311 | 22/69 | 0.319 | 0.460 | + +Same conclusion as the ftINIT levers: parameter tuning can nudge (`prod_weight≥1.0` +or a larger `eps` modestly grows the model and lifts Jaccard from 0.41 to ~0.48), but +**no tINIT parameter recovers anything close to ftINIT's functionality** (22/69 at best +vs ftINIT's 67–69/69 at the same dropout). The gap-fill layer, not the parameter +choice, is what bridges the gap. + +--- + +## 3. Cross-solver portability + +See [init_solver_benchmark.md](init_solver_benchmark.md) for the genome-scale +solver comparison (Gurobi/HiGHS/GLPK) and [tests/test_init_solvers.py](../tests/test_init_solvers.py) +for CI parameterised over installed MILP backends. Headline: at genome scale only Gurobi +is viable today; HiGHS fails on an upstream optlang `hybrid_interface.clone()` bug; GLPK +ignores `configuration.timeout` on MIP and ran 1 h+ without converging. Toy-scale +correctness is portable (Gurobi + GLPK give identical verdicts on the unit-test +networks), so local development works without a Gurobi licence. + +--- + +## Reproducing + +```bash +python scripts/analyze_init_params.py --cell HCT116 --sweeps ftinit_milp,prep_scale,tinit,ftinit_full +python scripts/analyze_init_robustness.py --cell HCT116 --algo ftinit # then --algo tinit +``` + +Both reuse the cached Human-GEM preps from the validation run +([docs/humangem_validation.md](humangem_validation.md)) and are resumable. 
diff --git a/docs/init_solver_benchmark.md b/docs/init_solver_benchmark.md new file mode 100644 index 0000000..1cd97ac --- /dev/null +++ b/docs/init_solver_benchmark.md @@ -0,0 +1,67 @@ +# Cross-solver ftINIT benchmark — Human-GEM / HCT116 + +Same `ftinit()` call (no-task scaled prep; `mip_gap=0.001`, `time_limit=900s`) run with each +installed MILP-capable optlang interface. Generated by `scripts/analyze_init_solvers.py`; +companion to the CI-scale `tests/test_init_solvers.py`. + +## Per-solver result + +| solver | time (s) | status | n_rxns | +|--------|---------:|--------|-------:| +| **gurobi** | 518 | ✅ ok | 7752 | +| **hybrid** (HiGHS) | 55 | ❌ FAIL: `ValueError: LP Method primal is not valid (choose one of: auto, simplex, interior point)` | 0 | +| **glpk** | 3672 | ❌ FAIL: did not converge in 1 h+ (`configuration.timeout` not honored by GLPK MIP) | 0 | + +> Wall clocks on Gurobi 13.0.1, optlang 1.x, cobra; one Human-GEM HCT116 cell line. + +## Findings + +* **Gurobi** is the only MILP backend that actually completes ftINIT on Human-GEM here: + ~9 min for 7752 reactions (matches the [validation](humangem_validation.md) result). + All our tractability tuning (big-M=100, `rescaleModelForINIT`, `mip_gap`, + `time_limit`) was done on Gurobi and it pays off. +* **HiGHS** (`hybrid_interface`) **does not work with cobra at all in this stack** — not + raven-python's bug. Cobra sets `model.solver = "hybrid"` which calls + `optlang.interface.Model.clone()`, which re-applies a stored `lp_method="primal"` + parameter that the `hybrid_interface.Configuration` rejects (it accepts only + `auto/simplex/interior point`). This breaks `model.copy()` and any flow that swaps + the solver — i.e. the whole pipeline. The same failure mode shows up at toy scale in + `tests/test_init_solvers.py` (5/5 fail), so CI catches it now. Upstream optlang/cobra + patch needed; nothing to fix in raven-python. 
+* **GLPK** loads the model but its MIP solver does **not honor + `configuration.timeout`** for this problem — we set the 900 s wall limit, GLPK still + ran 1 h+ at 100 % CPU without producing a solution and had to be killed. GLPK has no + licensing burden but is not a viable MILP backend at genome scale for ftINIT in + practice. + +## Practical implications + +* **Production / genome-scale ftINIT requires Gurobi** today. We should be explicit + about this in the package docs (license-encumbered dependency) until either the + optlang `hybrid_interface` clone bug is fixed or GLPK gains usable MIP time-limit + support. +* **Toy / unit-test correctness is portable.** `tests/test_init_solvers.py` shows Gurobi + and GLPK give identical verdicts on the toy ftINIT/tINIT networks; the formulation + itself is solver-independent. Local development and CI work without a Gurobi license; + only the genome-scale runs need it. +* **Future portability work** is two concrete upstream fixes: + 1. optlang `hybrid_interface.Configuration` should accept (or remap) the `lp_method` + parameter values that the generic clone path emits, or the clone path should drop + unknown LP-method values gracefully. + 2. GLPK's MIP solve should honor `configuration.timeout`. If upstream won't, + raven-python could implement a watchdog (separate thread sending `SIGINT` after the + wall limit) specifically when the solver is GLPK. + +## Reproducing + +```bash +# CI parameterised tests (seconds, runs always): +python -m pytest tests/test_init_solvers.py -v + +# Genome-scale benchmark (minutes-to-hours, manual): +python scripts/analyze_init_solvers.py --cell HCT116 \ + --doc docs/init_solver_benchmark.md +``` + +Both reuse the cached Human-GEM no-task prep from the validation run +([humangem_validation.md](humangem_validation.md)) and are resumable per solver. 
diff --git a/docs/kegg_data_format.md b/docs/kegg_data_format.md new file mode 100644 index 0000000..efb6d13 --- /dev/null +++ b/docs/kegg_data_format.md @@ -0,0 +1,72 @@ +# KEGG relational-table storage format + +This note records *why* raven-python stores its KEGG-derived relational tables as +**gzipped TSV** (xz-compressed TSV for the one large table), and what other options we deliberately deferred. It applies to +the maintainer-built KEGG artefacts described in PLAN.md §2.3b — the `ko_reaction`, +`organism_gene_ko`, KO-name, and reaction-flag tables. + +The reference GEM itself is stored as **gzipped RAVEN/cobra YAML** +(`reference_model.yml.gz`) — RAVEN-native and MATLAB-readable, gzipped to match the +tables (the YAML I/O transparently gzips on a `.gz` suffix). On the real KEGG dump +this is ~1.1 MB (vs ~30 MB as SBML) for the full 12k-reaction gene-free model. + +End users do not build any of this: the published artefacts are fetched and cached +under `~/.cache/raven-python/data/kegg-<version>/` by `ensure_data` (see +`raven_python.data`), mirroring how binaries are provisioned. + +## Decision (current) + +- **Small tables** (`ko_reaction`, `ko_names`, `rxn_flags`): **gzipped TSV + (`.tsv.gz`)**. Each is well under 1 MB, so compression choice is irrelevant; + gzip keeps them MATLAB-native and dependency-free. +- **The large `organism_gene_ko` table**: **xz-compressed TSV + (`organism_gene_ko.tsv.xz`), with rows sorted by `(organism, gene)`**. + +Why the large table differs. It carries KEGG's ~9M gene↔KO associations and +dominates the artefact set (≈78 MB as unsorted gzipped TSV). Two cheap, +stdlib-only changes cut that to ≈27 MB (2.9×): + +1. **Sort by `(organism, gene)`** before writing. Gene IDs from one organism + share long common prefixes (locus tags, numeric runs); sorting makes them + adjacent so the compressor can fold them. This alone takes 78 → 48 MB and + happens to match the by-organism query pattern in + `get_kegg_model_for_organism`.
The sort is an external merge sort bounded to + `chunk_rows` in memory (see `stream_organism_gene_ko`), so it stays scalable. +2. **xz instead of gzip** (Python stdlib `lzma`). Its larger dictionary captures + cross-row redundancy gzip's 32 KB window misses: sorted + xz reaches ≈27 MB. + +- **pandas reads/writes both with zero extra dependencies** — compression is + inferred from the `.gz`/`.xz` suffix; `lzma` and `gzip` are both stdlib, so + this works natively on Windows, macOS, and Linux with no external binary. +- **MATLAB caveat:** `readtable` reads gzipped TSV after a `gunzip`, but MATLAB + has no built-in xz decompressor. The small tables stay MATLAB-native; the + large table needs an external `unxz` (or Java/`7-Zip`) before `readtable` on + the MATLAB side. The xz file is raven-python's (Python) primary artefact; this + trades a little MATLAB convenience on the one big file for a ~3× size cut. + +## Options considered + +| Format | Python cost | MATLAB cost | Notes | +| --- | --- | --- | --- | +| **Gzipped TSV** ✅ | none (stdlib/pandas) | none (`readtable`) | Universal, text, types re-specified on read. Chosen. | +| Parquet | `pyarrow` or `fastparquet` (~40–60 MB wheel) as a `raven-python[kegg]` extra | needs ≥ R2019a (`parquetread`, native) | Smaller, faster, typed, columnar. Win mainly at scale / repeated random access. | +| SQLite | none (stdlib `sqlite3`) | **needs Database Toolbox** | Rejected: the MATLAB-side toolbox requirement breaks the "same files, both languages, no extra deps" goal. | + +## When to revisit + +Reconsider Parquet (or SQLite) if any of these become true: + +- The `organism_gene_ko` table grows large enough that load *time* (not just + size — the sort+xz change above already addresses on-disk size) becomes a real + bottleneck. The remaining inefficiency is that building one species' model + still loads all ~9M rows; sorted order makes a `searchsorted`/row-group + by-organism read the natural next step before reaching for Parquet. 
+- We start doing repeated random-access / columnar reads rather than a single + load-once-per-run pattern. +- A typed, self-describing schema becomes valuable (TSV loses dtypes; they are + re-specified on read). + +If revisited, prefer **Parquet** over SQLite (no MATLAB toolbox dependency; MATLAB +reads Parquet natively from R2019a). It could be offered as an optional +`raven-python[kegg]` extra (pyarrow) alongside the TSV default, rather than replacing +it — keeping the dependency-free path intact for users who don't opt in. diff --git a/docs/kegg_hmm_cutoff_calibration.md b/docs/kegg_hmm_cutoff_calibration.md new file mode 100644 index 0000000..43e3b3e --- /dev/null +++ b/docs/kegg_hmm_cutoff_calibration.md @@ -0,0 +1,203 @@ +# KEGG HMM-query cut-off calibration + +This note records the measurements behind the default KO-assignment parameters in +`reconstruction/kegg/query.py` (`assign_kos` / `get_kegg_model_from_sequences`, +pipeline step 3b.5) and IMPROVEMENTS **K15**. It is the evidence for moving away +from RAVEN's `1e-50` cut-off. + +## What the parameters do + +`assign_kos` turns an `hmmscan` KO×gene E-value matrix into gene→KO assignments +in three steps: + +1. **`cutoff`** — keep hits with `evalue <= cutoff`. +2. **`min_score_ratio_ko`** — within a KO, drop genes whose + `log(evalue)/log(best_evalue_in_KO) < min_score_ratio_ko`. +3. **`min_score_ratio_g`** — within a gene, drop KOs whose + `log(evalue)/log(best_evalue_for_gene) < min_score_ratio_g`. + +## Method + +- **Data:** KEGG release 118. Libraries: the maintainer-built `prokaryotes.hmm` + (831 MB) and `eukaryotes.hmm` (692 MB), 90 %-clustered, FFT-NS-2/PartTree (K12). +- **Queries:** each organism's full proteome, extracted from `genes.pep`. +- **Ground truth:** the organism's *real* KEGG gene→KO links, from the + `organism_gene_ko` table (restricted, as the table is, to reaction-linked KOs). 
+- **Prediction:** `assign_kos` output, with the `organism:` prefix stripped from + query gene IDs so they match the bare gene IDs in the ground truth. +- **Metrics (gene→KO level):** precision = |pred ∩ truth| / |pred|, + recall = |pred ∩ truth| / |truth|, F1. Reaction-level: `rxn_rec` = fraction of + the organism's true reactions recovered (KO→reaction via `ko_reaction`); + `rxn_novel` = predicted reactions **not** in the annotation set. +- Reproduce with [`scripts/analyze_hmm_cutoffs.py`](../scripts/analyze_hmm_cutoffs.py). + +### Important caveat + +All four organisms are **present in the libraries' training set**, so their own +sequences hit their KO profiles strongly and recall is an upper bound. The +calibration is therefore *relative* (how the parameters trade off, and where +RAVEN's default sits relative to the signal), not an absolute accuracy estimate. +A genome genuinely absent from KEGG would be the next validation. Also note that +`rxn_novel` / "precision < 1" partly reflects **legitimate homology** KEGG never +annotated for that organism (paralogs, un-curated genes), not pure error — so the +precision figures are a lower bound on real precision. + +## Organisms + +| code | organism | library | proteome (seqs) | true gene→KO pairs | true reactions | +|---|---|---|---|---|---| +| `sce` | *Saccharomyces cerevisiae* (budding yeast) | euk | 6021 | 841 | 1217 | +| `cme` | *Cyanidioschyzon merolae* (red alga) | euk | 5010 | 709 | 1157 | +| `eco` | *Escherichia coli* K-12 MG1655 | prok | 4288 | 1071 | 1548 | +| `mge` | *Mycoplasmoides genitalium* G37 (minimal genome) | prok | 476 | 110 | 211 | + +`sce`/`eco` are model organisms; `cme`/`mge` are lesser-studied, `mge` +additionally being a small, divergent genome. + +## 1. E-value separation (the key result) + +`log10(E-value)` percentiles of the best hit per (gene, KO) pair, split by whether +the pair is in the organism's annotation (**matched**) or not (**novel**). Smaller +(more negative) = stronger hit. 
+ +| organism | group | n | p50 | p90 | p95 | p99 | +|---|---|---|---|---|---|---| +| `sce` | matched | 835 | −155 | −75 | −59 | −33 | +| `sce` | novel | 9467 | −8 | −2 | −0 | 1 | +| `cme` | matched | 704 | −133 | −63 | −47 | −25 | +| `cme` | novel | 10170 | −8 | −2 | −2 | 0 | +| `eco` | matched | 1070 | −142 | −69 | −57 | −36 | +| `eco` | novel | 27357 | −7 | −2 | −1 | −0 | +| `mge` | matched | 110 | −100 | −42 | −35 | −15 | +| `mge` | novel | 1904 | −4 | −2 | −1 | −0 | + +**Reading:** matched pairs cluster at E ≈ 1e-100…1e-155; even their weakest 1 % +sit at 1e-15…1e-36. Novel pairs cluster at ≈1e-8. The two are separated by ~20 +orders of magnitude. RAVEN's **`1e-50` lies inside the *matched* tail** (between +the matched p90 and p99 for `sce`, `cme` and `eco`; stricter than even the matched p90 for `mge`, where more than 10 % of matched pairs fail the cut-off), so it discards +real-but-weakly-scoring annotations while gaining nothing against the (far weaker) +noise. + +## 2. Cut-off sweep + +`min_score_ratio_ko = 0.3`, `min_score_ratio_g = 0.8` fixed; gene→KO precision / +recall / F1 and reaction recovery vs the annotation.
+ +### `sce` +| cutoff | gKO prec | gKO rec | gKO F1 | rxn rec | rxn novel | +|---|---|---|---|---|---| +| 1e-10 | 0.57 | 0.98 | 0.72 | 0.99 | 334 | +| 1e-20 | 0.65 | 0.98 | 0.78 | 0.97 | 283 | +| 1e-30 | 0.72 | 0.97 | 0.83 | 0.97 | 216 | +| 1e-50 | 0.78 | 0.95 | 0.86 | 0.96 | 157 | +| 1e-70 | 0.81 | 0.91 | 0.86 | 0.91 | 113 | +| 1e-100 | 0.84 | 0.76 | 0.80 | 0.79 | 68 | + +### `cme` +| cutoff | gKO prec | gKO rec | gKO F1 | rxn rec | rxn novel | +|---|---|---|---|---|---| +| 1e-10 | 0.50 | 0.98 | 0.67 | 1.00 | 541 | +| 1e-20 | 0.57 | 0.98 | 0.72 | 1.00 | 421 | +| 1e-30 | 0.61 | 0.97 | 0.75 | 0.97 | 367 | +| 1e-50 | 0.70 | 0.93 | 0.80 | 0.94 | 307 | +| 1e-70 | 0.75 | 0.85 | 0.80 | 0.87 | 223 | +| 1e-100 | 0.80 | 0.70 | 0.75 | 0.71 | 136 | + +### `eco` +| cutoff | gKO prec | gKO rec | gKO F1 | rxn rec | rxn novel | +|---|---|---|---|---|---| +| 1e-10 | 0.53 | 0.99 | 0.69 | 0.99 | 382 | +| 1e-20 | 0.57 | 0.99 | 0.73 | 0.99 | 300 | +| 1e-30 | 0.60 | 0.98 | 0.75 | 0.99 | 268 | +| 1e-50 | 0.67 | 0.95 | 0.78 | 0.98 | 198 | +| 1e-70 | 0.73 | 0.88 | 0.80 | 0.93 | 157 | +| 1e-100 | 0.82 | 0.74 | 0.77 | 0.80 | 96 | + +### `mge` +| cutoff | gKO prec | gKO rec | gKO F1 | rxn rec | rxn novel | +|---|---|---|---|---|---| +| 1e-10 | 0.52 | 0.98 | 0.68 | 0.99 | 75 | +| 1e-20 | 0.62 | 0.96 | 0.75 | 0.98 | 51 | +| 1e-30 | 0.65 | 0.95 | 0.77 | 0.98 | 39 | +| 1e-50 | 0.77 | 0.84 | 0.80 | 0.87 | 29 | +| 1e-70 | 0.85 | 0.73 | 0.78 | 0.73 | 27 | +| 1e-100 | 0.87 | 0.50 | 0.64 | 0.47 | 21 | + +**Reading:** recall is flat-and-high from 1e-10 to ~1e-30, then falls as the +cut-off eats into the matched tail — gently for model organisms, sharply for the +divergent `mge` (rxn recall 0.98 → 0.87 from 1e-30 → 1e-50, → 0.47 at 1e-100). +The recall lost to a stricter cut-off is *not* noise rejection (noise is at 1e-8); +it is real annotation. `rxn_novel` shrinks with stricter cut-offs because strong +un-annotated homologs are also removed. + +## 3. 
Score-ratio sweep (`cutoff = 1e-50`) + +| organism | ko ratio | g ratio | gKO prec | gKO rec | gKO F1 | +|---|---|---|---|---|---| +| `sce` | 0.0 | 0.50 | 0.61 | 0.96 | 0.74 | +| `sce` | 0.0 | 0.80 | 0.77 | 0.95 | 0.85 | +| `sce` | 0.0 | 0.95 | 0.84 | 0.93 | 0.88 | +| `sce` | 0.3 | 0.80 | 0.78 | 0.95 | 0.86 | +| `sce` | 0.5 | 0.80 | 0.80 | 0.95 | 0.86 | +| `cme` | 0.0 | 0.50 | 0.53 | 0.94 | 0.68 | +| `cme` | 0.0 | 0.80 | 0.69 | 0.93 | 0.79 | +| `cme` | 0.0 | 0.95 | 0.78 | 0.92 | 0.84 | +| `cme` | 0.3 | 0.80 | 0.70 | 0.93 | 0.80 | +| `cme` | 0.5 | 0.80 | 0.70 | 0.93 | 0.80 | +| `eco` | 0.0 | 0.50 | 0.39 | 0.96 | 0.56 | +| `eco` | 0.0 | 0.80 | 0.66 | 0.95 | 0.78 | +| `eco` | 0.0 | 0.95 | 0.76 | 0.94 | 0.84 | +| `eco` | 0.3 | 0.80 | 0.67 | 0.95 | 0.78 | +| `eco` | 0.5 | 0.80 | 0.69 | 0.95 | 0.80 | +| `mge` | 0.0 | 0.50 | 0.62 | 0.85 | 0.72 | +| `mge` | 0.0 | 0.80 | 0.77 | 0.84 | 0.80 | +| `mge` | 0.0 | 0.95 | 0.82 | 0.81 | 0.81 | +| `mge` | 0.3 | 0.80 | 0.77 | 0.84 | 0.80 | +| `mge` | 0.5 | 0.80 | 0.78 | 0.84 | 0.81 | + +**Reading:** +- **`min_score_ratio_ko` is inert** — across all four organisms, varying it + 0.0 → 0.3 → 0.5 changes precision/recall by ≤0.02 (mostly 0.00). It is a + magic-number knob that does effectively nothing here. (Full 0.0/0.3/0.5 × g-grid + in the script output; representative rows shown.) +- **`min_score_ratio_g` is the real precision lever** — 0.80 → 0.95 lifts + precision ~0.07–0.10 for ~0.02 recall loss. 0.50 is clearly too loose. + +## 4. Chosen defaults and effect + +| parameter | RAVEN / old | new default | rationale | +|---|---|---|---| +| `cutoff` | 1e-50 | **1e-30** | recovers the matched tail (esp. 
divergent genomes); still ~22 orders above the 1e-8 noise floor | +| `min_score_ratio_g` | 0.8 | **0.9** | the effective precision lever; offsets the looser cut-off | +| `min_score_ratio_ko` | 0.3 | 0.3 (kept) | empirically inert; retained for RAVEN parity | + +Old default `(1e-50, 0.3, 0.8)` vs new default `(1e-30, 0.3, 0.9)` +(`min_score_ratio_ko` 0.3 ≡ 0.0 here): + +| organism | gKO prec | gKO rec | rxn rec | rxn novel | +|---|---|---|---|---| +| `sce` | 0.78 → 0.76 | 0.95 → 0.96 | 0.96 → 0.96 | 157 → 137 | +| `cme` | 0.70 → 0.67 | 0.93 → 0.96 | 0.94 → 0.97 | 307 → 305 | +| `eco` | 0.67 → 0.67 | 0.95 → 0.97 | 0.98 → 0.99 | 198 → 173 | +| `mge` | 0.77 → 0.69 | **0.84 → 0.94** | **0.87 → 0.97** | 29 → 35 | + +The divergent minimal genome gains ~10 points of recall (the case the sequence +path exists for); model organisms improve slightly and `eco` emits *fewer* +unannotated reactions (the tighter gene-ratio prunes spurious multi-KO genes). The +small precision dip vs annotation is dominated by extra strong homologs, not +weak-hit noise. + +## 5. Whole-model cross-validation (sanity check) + +Full reconstruction of *S. cerevisiae* two ways, at the old defaults: + +| | annotation path (3b.4) | HMM path (3b.5) | +|---|---|---| +| reactions | 1355 | 1461 | +| metabolites | 1501 | 1567 | +| genes | 835 | 896 | + +Reaction recall 96.3 % (1305/1355 shared, Jaccard 0.86); gene recall 96.6 % +(807/835 shared, Jaccard 0.87). The annotation path also exercises the new +`organism_gene_ko.tsv.xz` artefact (K14). `hmmscan` throughput ≈ 0.1 s/query +against either library on 12 threads (yeast: 6021 queries in 633 s). 
diff --git a/docs/maintaining_binaries.md b/docs/maintaining_binaries.md new file mode 100644 index 0000000..df5b315 --- /dev/null +++ b/docs/maintaining_binaries.md @@ -0,0 +1,236 @@ +# Maintaining bundled binaries (BLAST+, DIAMOND, …) + +Audience: **raven-python maintainers / the GitHub repo owner.** This explains how +raven-python ships external command-line tools, how to update their versions, and how +to build **minimal-footprint** ZIPs to attach to a GitHub release. + +> End users never read this. They get a binary automatically via `ensure_binary`, +> or use their own (system/conda) install. This doc is only for whoever publishes +> the release assets. + +--- + +## 1. How binary provisioning works + +raven-python does **not** vendor binaries in the git repo or on PyPI. Instead: + +1. For each tool we publish **version-pinned ZIPs as GitHub release assets**. +2. A **registry** (`src/raven_python/binaries_registry.json`) maps each *bundle* to its + version, the executables it provides, and per-platform `{asset, sha256}`. +3. At run time `raven_python.binaries.ensure_binary("blastp")` resolves a tool in this + order — and only reaches the download as a last resort: + + ``` + explicit binary= arg → env var (RAVEN_PYTHON_BLASTP / RAVEN_PYTHON_DIAMOND / …) + → shutil.which on PATH (system / conda / apt / brew) + → ensure_binary: download the pinned ZIP → verify SHA256 → cache → return path + → actionable error (with conda / manual instructions) + ``` + +So a pre-installed binary always wins; the bundle is the zero-setup fallback. +Pinning the version makes reconstruction **reproducible**. + +A *bundle* can provide several executables from one download (e.g. the `blast` +bundle provides both `blastp` and `makeblastdb`), so they are fetched once. + +--- + +## 2. What raven-python actually needs — ship only these + +Distribute the **minimum** set of executables. Everything else (other suite +tools, docs, examples, changelogs) must be excluded. 
+ +| Bundle | Executables to include | Everything else | +|---|---|---| +| `diamond` | `diamond` | — (it is a single static binary) | +| `blast` | `blastp`, `makeblastdb` | **drop** `blastn`, `tblastn`, `psiblast`, `rpsblast`, `blast_formatter`, `*_vdb`, the `doc/`, `ChangeLog`, `README`, ~30 other tools | + +(Confirmed against RAVEN `getBlast`/`getDiamond`: only `makeblastdb`+`blastp`, and +`diamond` for its `makedb`/`blastp` subcommands, are ever invoked.) + +For BLAST+ this is the big win: the full NCBI suite is ~hundreds of MB; two +binaries (stripped) are a small fraction. + +--- + +## 3. Asset & ZIP conventions + +**Asset filename:** `<bundle>-<version>-<os>-<arch>.zip` + +- `<os>` ∈ `linux`, `macos`, `windows` +- `<arch>` ∈ `x86_64`, `arm64` +- examples: `diamond-2.1.11-linux-x86_64.zip`, `blast-2.16.0-macos-arm64.zip` + +**ZIP layout — flat, executables at the root, plus the upstream licence:** + +``` +diamond-2.1.11-linux-x86_64.zip +├── diamond +└── LICENSE + +blast-2.16.0-linux-x86_64.zip +├── blastp +├── makeblastdb +└── LICENSE +``` + +No nested `bin/`, no extra files. `ensure_binary` extracts the ZIP into the cache +and expects the executable at the top level. + +--- + +## 4. Step-by-step: add or update a version + +Example: bump DIAMOND to a new version for Linux x86-64. Repeat per `(os, arch)`. + +1. **Download the official upstream build** (never rebuild from source unless you + must): + - DIAMOND → <https://github.com/bbuchfink/diamond/releases> + (`diamond-linux64.tar.gz`, `diamond-macos.tar.gz`) + - BLAST+ → <https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/> or a + pinned version dir (`ncbi-blast-<version>+-x64-linux.tar.gz`, + `-x64-macosx.tar.gz`, `-aarch64-linux.tar.gz`, `-x64-win64.tar.gz`). + - Record the upstream URL **and** its published checksum for provenance. +2. **Extract only the needed executables** (see §2) to a clean staging dir. +3. **Strip debug symbols** to shrink (skip on Windows / signed macOS builds): + ```bash + strip diamond # or: strip blastp makeblastdb + ``` +4.
**Smoke-test the stripped binaries in a clean shell** (no other tools on PATH): + ```bash + ./diamond --version + ./blastp -version && ./makeblastdb -version + ``` + If they fail for a missing shared library, add that `.so`/`.dylib` to the ZIP + (rare — NCBI/DIAMOND release builds are largely self-contained). +5. **Add the upstream licence file** as `LICENSE` (see §6). +6. **Zip with max compression, flat layout:** + ```bash + zip -9 -j diamond-2.1.11-linux-x86_64.zip diamond LICENSE + # -j junks paths so entries sit at the ZIP root + ``` +7. **Compute the SHA256:** + ```bash + sha256sum diamond-2.1.11-linux-x86_64.zip # shasum -a 256 on macOS + ``` +8. **Attach the ZIP to a raven-python GitHub release** (a release tagged for the binary + set, e.g. `binaries-2024.06`, keeps them independent of code releases). +9. **Update the registry** `src/raven_python/binaries_registry.json` — bump `version` + and set the per-platform `asset` + `sha256`: + ```json + { + "diamond": { + "version": "2.1.11", + "provides": ["diamond"], + "platforms": { + "linux-x86_64": { + "asset": "diamond-2.1.11-linux-x86_64.zip", + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/binaries-2024.06/diamond-2.1.11-linux-x86_64.zip", + "sha256": "" + } + } + }, + "blast": { + "version": "2.16.0", + "provides": ["blastp", "makeblastdb"], + "platforms": { "linux-x86_64": { "asset": "...", "url": "...", "sha256": "..." } } + } + } + ``` +10. **Commit the registry change**, run the homology tests, and (if you have the + binary) confirm `ensure_binary("diamond", version="2.1.11")` downloads, + verifies, and runs. + +--- + +## 5. Keeping the footprint minimal — checklist + +- ✅ Only the executables in §2 (for BLAST+, exactly `blastp` + `makeblastdb`). +- ✅ `strip` the binaries (often halves their size). +- ✅ `zip -9 -j` (max compression, flat — no `bin/`, no folders). +- ✅ Exactly one extra file: `LICENSE`. 
+- ❌ No docs, examples, `ChangeLog`, `README`, man pages, test data, or sibling tools. +- ❌ No `.dSYM`/debug bundles; no duplicate static `.a` libraries. +- ➕ Only add a shared library if step-4 testing proves it is required. + +--- + +## 6. Platform / architecture matrix & licensing + +**Coverage = what you build.** Start with `linux-x86_64` (CI default), then add +`macos-arm64`, `macos-x86_64`, `linux-arm64`, `windows-x86_64` as capacity allows. +For any `(os, arch)` **not** in the registry, `ensure_binary` raises an actionable +error pointing to conda (`conda install -c bioconda diamond blast`) or a manual +install — that is the documented fallback, not a failure to fix urgently. + +**Licensing (must comply when redistributing):** + +- **BLAST+** — produced by NCBI (US Government); **public domain**, free to + redistribute. Include NCBI's `LICENSE` for courtesy/provenance. +- **DIAMOND** — **GPLv3**. Redistribution is allowed; you **must** include the + GPLv3 licence text in the ZIP and keep the binary unmodified (or offer source). +- **HMMER** (future) — BSD-3-Clause; include its `LICENSE`. + +Always ship the upstream licence in the ZIP, and keep a `BINARIES_PROVENANCE.md` +(or a note in the release body) recording, per asset: upstream URL, upstream +version, upstream checksum, and the SHA256 you published. + +### Native OS support per tool + +raven-python invokes each tool through `subprocess.run([resolved_path, …])` — that +call is itself cross-platform, so the real constraint is whether a given tool has +a binary that runs natively on each OS. It varies: + +| Tool | Linux | macOS (incl. 
arm64) | Windows (native) | +|---|---|---|---| +| BLAST+ (`blastp`, `makeblastdb`) | ✅ | ✅ | ✅ (NCBI ships Windows builds) | +| DIAMOND | ✅ | ✅ | ⚠️ native build exists but Linux-first | +| HMMER (`hmmbuild`/`hmmpress`/`hmmsearch`/`hmmscan`) | ✅ | ✅ | ❌ no official native build | +| MAFFT | ✅ | ✅ | ⚠️ Windows package is a wrapper | +| CD-HIT | ✅ | ✅ | ❌ no Windows build exists | + +Implications: + +- **Linux / macOS** — everything works. `conda install -c bioconda hmmer mafft + cd-hit blast diamond`, or point the `RAVEN_PYTHON_*` env vars at your installs. +- **Native Windows** — the homology track (BLAST+/DIAMOND) works, but the **KEGG + HMM build (3b.3) and HMM query (3b.5) do not**: HMMER and CD-HIT have no Windows + binaries, and bioconda has no Windows packages for any of them. Bundling can't + fix this — there is no binary to bundle. +- **Windows users should run raven-python inside WSL2** (or a Linux container), where + every tool is native Linux. raven-python does **not** replicate RAVEN's + `getWSLpath`/`wsl …` path translation: it calls the resolved binary directly, so + mixing native-Windows Python with WSL binaries is unsupported — keep the whole + stack inside WSL2. +- The common end-user paths — homology reconstruction and the KEGG *species* model + (3b.4) — need no HMMER/MAFFT/CD-HIT, so they are fully cross-platform. + +--- + +## 7. Emitting the registry entry + +After building the per-platform ZIPs (named `<bundle>-<version>-<os>-<arch>.zip`) +and uploading them to the release, generate the `_REGISTRY` entry — checksums and +URLs — with [`scripts/make_registry_snippet.py`](../scripts/README.md): + +```bash +python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \ + --provides blastp makeblastdb --dir zips \ + --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0 +``` + +It prints the ready-to-paste `_REGISTRY["blast"]` block; its SHA256 helper is the +same one `ensure_binary` verifies with, so the checksums always match.
(Producing +the minimal ZIPs themselves — download upstream, `strip`, `zip -9 -j`, add +`LICENSE` per §3–§6 — is still a manual/per-tool step.) + +--- + +## 8. Adding a new tool later (e.g. HMMER for KEGG reconstruction) + +1. Decide the **minimal executable set** (e.g. HMMER → `hmmsearch`, `hmmscan`, + maybe `hmmbuild`/`hmmpress`). +2. Add a bundle entry to the registry with `provides` listing those executables. +3. Build/attach ZIPs per §3–§4; include the tool's licence (§6). +4. The wrappers call `ensure_binary("hmmsearch", …)` with the same resolution + order — no new provisioning code needed. diff --git a/docs/maintaining_kegg_data.md b/docs/maintaining_kegg_data.md new file mode 100644 index 0000000..f53d0da --- /dev/null +++ b/docs/maintaining_kegg_data.md @@ -0,0 +1,157 @@ +# Maintaining the KEGG data artefacts + +This guide is for the **package maintainer** who rebuilds raven-python's KEGG +artefacts once per KEGG release. End users never do this — they download the +published, version-pinned artefacts. The build has three implemented steps: +**3b.1 download** (`reconstruction/kegg/download.py`), **3b.2 parse** +(`reconstruction/kegg/parse.py`) and **3b.3 HMM-library build** (`build_hmm_library`); see PLAN.md §2.3b for the full pipeline. + +## Prerequisites + +### A paid KEGG FTP subscription +The bulk KEGG dump is licensed. You need an active subscription to +`ftp.kegg.net`, which gives you a **username and password**. + +### Credentials in `~/.netrc` +The download reads your KEGG username and password from a `~/.netrc` file — it +never takes them on the command line, so they stay out of your shell history and +out of `ps` output.
Create the file (readable only by you) and add a `machine` +line for the KEGG host: + +```bash +touch ~/.netrc && chmod 600 ~/.netrc +``` + +Then add this single line to `~/.netrc`, substituting your subscription +credentials: + +``` +machine ftp.kegg.net login YOUR_KEGG_USER password YOUR_KEGG_PASSWORD +``` + +Notes: +- The host **must be `ftp.kegg.net`** — that is the machine name the downloader + looks up. A `machine` line for any other host is ignored. +- The file **must be mode `600`** (owner read/write only). Python's `netrc` + parser refuses a `.netrc` that other users can read. +- `~/.netrc` is the same convention `curl`, `wget` and `git` use, so if you + already have one, just add the `ftp.kegg.net` line to it. + +If you keep secrets somewhere other than `$HOME`, point the downloader at a +different file with `netrc_path=...` (see below); the format is identical. + +## Step 3b.1 — download and arrange the dump + +With `~/.netrc` in place, no credentials need to be passed in code: + +```python +from raven_python.reconstruction.kegg import download_kegg_dump + +# Reads ~/.netrc, fetches the KEGG archives, extracts and arranges them. +download_kegg_dump("keggdb") +``` + +This fetches the reaction / compound / glycan / ko archives, the eukaryote and +prokaryote proteomes, and the taxonomy file; extracts them; and arranges the +flat layout the parser expects (`reaction`, `reaction.lst`, +`reaction_mapformula.lst`, `compound` = compound + glycan, `compound.inchi`, +`ko`, `genes.pep` = both proteomes, `taxonomy`). 
+ +Credential alternatives: + +```python +# A .netrc in a non-default location: +download_kegg_dump("keggdb", netrc_path="/run/secrets/kegg_netrc") + +# Pass credentials explicitly (only when they come from a secret manager at +# runtime — never hardcode literals in committed code): +download_kegg_dump("keggdb", auth=("YOUR_KEGG_USER", "YOUR_KEGG_PASSWORD")) +``` + +Already-downloaded files are skipped; pass `force=True` to re-fetch (for a new +KEGG release). + +## Step 3b.2 — parse into the published artefacts + +```python +from raven_python.reconstruction.kegg import parse_kegg_dump + +parse_kegg_dump("keggdb", "artefacts") +``` + +This writes the gene-free reference model (`reference_model.yml.gz`, gzipped +RAVEN/cobra YAML) and the relational tables as compressed TSV (gzip for the small tables, xz for the large `organism_gene_ko` table). See +[kegg_data_format.md](kegg_data_format.md) for what those tables contain and the +format rationale. + +## Step 3b.3 — build the HMM libraries + +Build the per-domain profile-HMM libraries that the de-novo query path (3b.5) +searches. This needs **HMMER** (`hmmbuild`, `hmmpress`), **MAFFT**, and +**CD-HIT** on `PATH` (or set `RAVEN_PYTHON_HMMBUILD` / `RAVEN_PYTHON_MAFFT` / +`RAVEN_PYTHON_CDHIT`, etc.); install e.g. `conda install -c bioconda hmmer mafft cd-hit`. + +> **OS note:** these three tools run on Linux and macOS but **not native +> Windows** — on Windows, run this step inside WSL2. See the native-OS-support +> matrix in [maintaining_binaries.md](maintaining_binaries.md#native-os-support-per-tool).
+ +```python +from raven_python.reconstruction.kegg import build_hmm_library, read_kegg_table + +organism_gene_ko = read_kegg_table("artefacts/organism_gene_ko.tsv.xz") +for domain in ("prokaryotes", "eukaryotes"): + build_hmm_library( + organism_gene_ko, + "keggdb/genes.pep", # proteomes from 3b.1 + "keggdb/taxonomy", # domain split, from 3b.1 + f"hmms/{domain}", + domain=domain, + ) +``` + +For each KO in the domain it gathers the member sequences, dereplicates with +CD-HIT (~90 % identity), aligns with MAFFT, trains a profile with `hmmbuild`, and +finally concatenates and `hmmpress`-es them into a single `library.hmm` for fast +`hmmscan` querying. This is the slowest step (hours, once per KEGG release); it +skips KOs whose `.hmm` already exists, so it is resumable. The resulting +libraries are published as version-pinned artefacts alongside the reference model +and tables. + +## Building and publishing in one go + +[`scripts/build_kegg_artefacts.py`](../scripts/README.md) runs 3b.2 (+ 3b.3 with +`--hmms`) and lays the output out as publishable assets (`.hmm` named for +`ensure_kegg_hmm_library`): + +```bash +python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8 +``` + +Upload the contents of `artefacts/` to a release, then emit the registry entry for +`raven_python.data._DATA_REGISTRY` with [`scripts/make_registry_snippet.py`](../scripts/README.md): + +```bash +python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \ + --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116 +``` + +Paste the printed block into `_DATA_REGISTRY`; from then on `ensure_data` fetches +and verifies the artefacts for end users automatically. 
+ +## End-user paths (3b.4 / 3b.5) + +End users do **not** run the steps above — the published artefacts are fetched and +cached automatically by `ensure_data` (`raven_python.data`) under +`~/.cache/raven-python/data/kegg-<version>/` on first use, so the entry points below +can be called with no local paths at all (pass an explicit `artefact_dir=`/ +`library=` to use your own build instead). Two runtime entry points build a draft +model from the artefacts: + +- **3b.4 — species in KEGG** (`get_kegg_model_for_organism_from_artefacts`): no + binaries needed; uses the organism's KEGG gene↔KO annotations. Fully + cross-platform. `organism_id="prokaryotes"`/`"eukaryotes"` builds a whole-domain + model (pass `taxonomy=`). +- **3b.5 — organism not in KEGG** (`get_kegg_model_from_sequences`): `hmmscan`-es a + proteome FASTA against the pressed `library.hmm`, so it needs **HMMER** + (`hmmscan`) — Linux/macOS or WSL2 (see the OS matrix). Tune assignment with + `cutoff`, `min_score_ratio_ko`, `min_score_ratio_g`. diff --git a/docs/yeast_localization_benchmark.md b/docs/yeast_localization_benchmark.md new file mode 100644 index 0000000..de75566 --- /dev/null +++ b/docs/yeast_localization_benchmark.md @@ -0,0 +1,148 @@ +# yeast-GEM localisation benchmark + +Real-data validation of [`localization.predict_localization`](../src/raven_python/localization/predict.py) +on the curated yeast-GEM. The benchmark is end-to-end — model, scoring, MILP — and +sweeps predictor noise so the failure modes are visible, not just the headline accuracy. + +* Driver: [`scripts/benchmark_localization_yeast.py`](../scripts/benchmark_localization_yeast.py) +* Yeast-GEM source: `pcSecYeastSpecies/Model/yeastGEM.xml` (3991 reactions, 1147 genes, + 14 compartments).
+* Run command: + ```bash + python scripts/benchmark_localization_yeast.py \ + --yeast-gem ~/github/pcSecYeastSpecies/Model/yeastGEM.xml \ + --noise 0,0.1,0.25,0.5 \ + --max-reactions 300 \ + --transport-cost 0.05 \ + --time-limit 300 \ + --doc docs/yeast_localization_benchmark.md + ``` + +## Setup + +1. **Truth set**: every yeast-GEM reaction that (a) has a GPR, (b) is non-boundary, + and (c) has all metabolites in the same compartment. 2 155 reactions qualify; + stratified subsampling to 298 keeps the per-compartment distribution. The 14 + compartments collapse to 12 placement targets in the truth set (extracellular and + the lipid particle / vacuolar membrane variants stay distinct). +2. **Flattening**: the model is squashed into one compartment with + [`manipulation.merge_compartments`](../src/raven_python/manipulation/compartments.py) + so the predictor cannot lean on metabolite-topology evidence. Without this step + every GPR'd reaction's "predicted" compartment is just its current one — vacuous. +3. **Reference scores**: each gene gets `1.0` in every compartment that hosts one of + its reactions in the original (multi-compartment) model. This is the + *perfect-predictor* upper bound; real WoLF PSORT / DeepLoc output will be noisier. +4. **Noise injection**: at noise level `p` each gene independently has probability + `p` of having a confidently *wrong* compartment grafted in as the new top score + (the true compartment is demoted to half its score). This simulates a predictor + that's right `1-p` of the time and confidently wrong otherwise — a more + pessimistic stand-in than uniform Gaussian jitter. +5. **MILP**: `transport_cost=0.05`, `multi_compartment_penalty=0.5`, `mip_gap=0.01`, + `time_limit=300s`, Gurobi. The MILP has 7 982 binaries, 2 691 rows, 29 842 + nonzeros at this scale — solves in 30–50 s. + +## Accuracy vs. predictor noise + +Accuracy = fraction of relocated reactions that the MILP places back in the truth +compartment. 
+ +| noise | seconds | n_total | n_correct | n_unplaced | accuracy | +|------:|--------:|--------:|----------:|-----------:|---------:| +| 0.00 | 46 | 298 | 213 | 0 | 0.715 | +| 0.10 | 34 | 298 | 199 | 0 | 0.668 | +| 0.25 | 41 | 298 | 175 | 0 | 0.587 | +| 0.50 | 31 | 298 | 115 | 0 | 0.386 | + +Monotone degradation, no MILP infeasibilities at any noise level. At 10 % confident +mis-scoring the accuracy drops only ~4.7 pp — the algorithm largely shrugs off small +predictor noise because each compartment's evidence is the sum of all its genes' +scores, so a few wrong genes get out-voted by their neighbours. At 50 % the algorithm +is still better than the 1/12 = 8.3 % uniform baseline, but the loss is steep. + +## Confusion matrix at noise=0.00 + +Rows = curated (true) compartment; columns = predicted. The `c` column dominates +because cytosolic genes are also active in many other compartments (so an mm-only +reaction shares its genes with cytosolic reactions, and the algorithm picks `c`). + +| true \ pred | c | ce | er | erm | g | gm | lp | m | mm | p | v | vm | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| **c** | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| **ce** | 4 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| **e** | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| **er** | 6 | 0 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| **erm** | 18 | 0 | 0 | 30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| **g** | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| **gm** | 4 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | +| **lp** | 0 | 0 | 0 | 0 | 0 | 0 | 21 | 0 | 0 | 0 | 0 | 0 | +| **m** | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 0 | +| **mm** | 38 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | +| **n** | 3 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | +| **p** | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | 0 | 0 | +| **v** | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | +| **vm** | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | + +## 
Per-compartment accuracy at noise=0.00 + +| compartment | n | n_correct | accuracy | +|---|---:|---:|---:| +| c | 91 | 91 | 1.000 | +| ce | 15 | 11 | 0.733 | +| e | 1 | 0 | 0.000 | +| er | 13 | 7 | 0.538 | +| erm | 48 | 30 | 0.625 | +| g | 2 | 2 | 1.000 | +| gm | 7 | 3 | 0.429 | +| lp | 21 | 21 | 1.000 | +| m | 28 | 20 | 0.714 | +| mm | 41 | 3 | 0.073 | +| n | 6 | 0 | 0.000 | +| p | 16 | 16 | 1.000 | +| v | 1 | 1 | 1.000 | +| vm | 8 | 8 | 1.000 | + +## What the failures mean + +* **Compartments with self-contained gene sets are perfect.** `c`, `g`, `lp`, `p`, + `v`, `vm` reach 100 %: their gene sets are largely disjoint from `c`'s, so the + per-compartment evidence sum picks them cleanly. +* **Inner-membrane vs. matrix collapses to cytosol.** The mitochondrial inner + membrane (`mm`, 41 reactions, 7.3 % correct) and nucleus (`n`, 6 reactions, 0 % + correct) lose to `c` because their genes are *also* annotated to cytosolic + reactions. The algorithm sees gene `X` with score `1.0` in both `c` and `mm`, + and `c` wins on the larger pool of co-localised reactions. This is faithful to + the predictor evidence — a real WoLF PSORT / DeepLoc score table that distinguishes + inner-membrane from matrix would do better here, but the + derive-scores-from-the-model harness can't see that distinction. +* **Membrane / non-membrane pairs split correctly.** `erm` vs `er`, `gm` vs `g` — + the algorithm prefers the membrane sub-compartment when its genes are more + membrane-typical, which the score harness reproduces. 60–75 % is honest signal. +* **No MILP infeasibilities.** Even at 50 % confident mis-scoring every reaction + gets placed (the unplaced column stays 0). + +## Calibration insight: `transport_cost` matters + +The first smoke run used `transport_cost=0.5` (the default) and dumped almost every +reaction into `c`. 
With ~5 metabolites per reaction the per-reaction transport bill +overwhelmed the unit-scale gene reward, so the optimiser's best move was always +"keep it in the default compartment, pay no transports." Dropping to +`transport_cost=0.05` restored the per-compartment signal. For a real predictor with +typical score magnitudes ≪ 1, the user should expect to dial `transport_cost` *down* +into the same per-metabolite-per-compartment range as the typical gene-score-delta — +the doc-string default of 0.5 is sized for clean integer-style scores, not +soft-probability output. + +## Reproducing + +Make the smoke fast (subsampled, small noise grid): +```bash +python scripts/benchmark_localization_yeast.py \ + --noise 0,0.25 --max-reactions 100 --time-limit 60 +``` + +Plug in a real predictor (CSV of `gene_id` + one column per compartment): +```bash +python scripts/benchmark_localization_yeast.py \ + --scores-csv my_deeploc_yeast.csv \ + --noise 0 --max-reactions 300 +``` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..faeeb1f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,84 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "raven-python" +version = "0.0.1" +description = "Reconstruction, Analysis and Visualization of Metabolic Networks in Python, a port of the RAVEN Toolbox built on cobrapy" +readme = "README.md" +requires-python = ">=3.11" +license = { text = "GPL-3.0-or-later" } +authors = [ + { name = "Eduard Kerkhoven", email = "eduardk@chalmers.se" }, +] +keywords = [ + "genome-scale-model", + "metabolic-model", + "reconstruction", + "raven", + "cobra", + "systems-biology", + "constraint-based-modeling", + "kegg", + "metacyc", + "tinit", +] +classifiers = [ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Operating System :: OS Independent", + "Programming Language :: 
Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "cobra>=0.29", + "numpy>=1.21", + "pandas>=2", + "scipy>=1.10", + "ruamel.yaml>=0.17", + "requests>=2.28", + "tqdm>=4.65", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7", + "pytest-cov", + "ruff>=0.4", +] +excel = [ + "openpyxl>=3.1", +] +plotting = [ + "matplotlib>=3.5", +] + +[project.urls] +Homepage = "https://github.com/SysBioChalmers/raven-python" +Source = "https://github.com/SysBioChalmers/raven-python" +Issues = "https://github.com/SysBioChalmers/raven-python/issues" +"RAVEN MATLAB" = "https://github.com/SysBioChalmers/RAVEN" + +[tool.hatch.build.targets.wheel] +packages = ["src/raven_python"] + +[tool.pytest.ini_options] +testpaths = ["tests"] + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "W", "I", "UP", "B"] +ignore = [ + "E501", # line length handled by the formatter +] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["E402"] diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..67a3b36 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,35 @@ +# Maintainer scripts + +Release-time tooling. Not part of the installed package — run them from a checkout +with raven-python installed (`pip install -e .`). End users never need these. + +## `build_kegg_artefacts.py` + +Build the publishable KEGG artefact set from an arranged KEGG dump (see +`download_kegg_dump`): the gzipped-YAML reference model, the gzipped-TSV tables, +and (with `--hmms`) the per-domain pressed HMM libraries. Output is laid out ready +to upload as release assets. See [docs/maintaining_kegg_data.md](../docs/maintaining_kegg_data.md). 
+ +```bash +python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts # tables + model +python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8 +``` + +## `make_registry_snippet.py` + +After uploading the files to a release, compute their SHA256 and print the entry +to merge into the runtime registry — `raven_python.data._DATA_REGISTRY` (data) or +`raven_python.binaries._REGISTRY` (binary ZIP bundles). The checksum helper is shared +with the resolvers, so published checksums always match what `ensure_data` / +`ensure_binary` verify. + +```bash +# Data artefacts: +python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \ + --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116 + +# Binary bundle (ZIPs named ---.zip): +python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \ + --provides blastp makeblastdb --dir zips \ + --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0 +``` diff --git a/scripts/analyze_hmm_cutoffs.py b/scripts/analyze_hmm_cutoffs.py new file mode 100644 index 0000000..654fc02 --- /dev/null +++ b/scripts/analyze_hmm_cutoffs.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +"""Cut-off sensitivity for the KEGG HMM query path (step 3b.5). + +Cross-validates ``assign_kos`` against an organism's *real* KEGG gene→KO +annotation (from the ``organism_gene_ko`` table) and sweeps the E-value cut-off +and the two score-ratio filters. Produces the tables in +``docs/kegg_hmm_cutoff_calibration.md``. + +Usage +----- + python scripts/analyze_hmm_cutoffs.py \ + --artefacts ~/keggdb_artefacts \ + --proteome /path/to/org.pep \ + --org sce --library ~/keggdb_artefacts/eukaryotes.hmm + +``--proteome`` is the organism's protein FASTA (headers ``>org:gene ...``, e.g. +extracted from KEGG ``genes.pep``). ``--tblout`` may be given instead of +``--library`` to reuse a cached ``hmmscan --tblout`` file. 
Requires ``hmmscan`` +on PATH or via ``RAVEN_PYTHON_HMMER`` when ``--library`` is used. + +Caveat: organisms present in the library's training set give an upper bound on +recall; the comparison is relative (see the doc). +""" +from __future__ import annotations + +import argparse +from pathlib import Path + +import numpy as np +import pandas as pd + +from raven_python.reconstruction.kegg.parse import read_kegg_table +from raven_python.reconstruction.kegg.query import ( + assign_kos, + parse_hmmscan_tblout, + run_hmmscan, +) + +CUTOFFS = (1e-10, 1e-20, 1e-30, 1e-50, 1e-70, 1e-100) +KO_RATIOS = (0.0, 0.3, 0.5) +G_RATIOS = (0.5, 0.8, 0.95) + + +def load_ko2rxn(artefacts: Path) -> dict[str, set[str]]: + tbl = read_kegg_table(artefacts / "ko_reaction.tsv.gz") + ko2rxn: dict[str, set[str]] = {} + for ko, rxn in zip(tbl["ko"], tbl["reaction"], strict=True): + ko2rxn.setdefault(ko, set()).add(rxn) + return ko2rxn + + +def ground_truth(artefacts: Path, org: str, ko2rxn) -> tuple[set, set]: + ogk = read_kegg_table(artefacts / "organism_gene_ko.tsv.xz") + rows = ogk[ogk["organism"].str.lower() == org] + pairs = set(zip(rows["gene"], rows["ko"], strict=True)) + rxns = {r for _, ko in pairs for r in ko2rxn.get(ko, ())} + return pairs, rxns + + +def predicted_pairs(hits: pd.DataFrame, **kw) -> set: + out = set() + for ko, genes in assign_kos(hits, **kw).items(): + for g in genes: + out.add((g.split(":", 1)[1] if ":" in g else g, ko)) + return out + + +def prf(pred: set, truth: set) -> tuple[float, float, float]: + tp = len(pred & truth) + rec = tp / len(truth) if truth else 0.0 + prec = tp / len(pred) if pred else 0.0 + f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0 + return prec, rec, f1 + + +def main(argv=None) -> None: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--artefacts", type=Path, required=True) + ap.add_argument("--org", required=True, help="KEGG organism code, e.g. 
sce") + ap.add_argument("--proteome", type=Path, help="protein FASTA (headers >org:gene)") + ap.add_argument("--library", type=Path, help="pressed HMM library for hmmscan") + ap.add_argument("--tblout", type=Path, help="cached hmmscan --tblout (skip hmmscan)") + ap.add_argument("--threads", type=int, default=4) + args = ap.parse_args(argv) + + if args.tblout: + text = args.tblout.read_text() + elif args.library and args.proteome: + text = run_hmmscan(args.proteome, args.library, threads=args.threads) + else: + ap.error("give --tblout, or --library and --proteome") + + org = args.org.lower() + hits = parse_hmmscan_tblout(text) + hits = hits[hits["gene"].str.startswith(f"{org}:")].reset_index(drop=True) + ko2rxn = load_ko2rxn(args.artefacts) + gt_pairs, gt_rxns = ground_truth(args.artefacts, org, ko2rxn) + + print(f"\n{'='*70}\n{org}: {hits['gene'].nunique()} query genes with hits, " + f"{len(gt_pairs)} true gene->KO pairs, {len(gt_rxns)} true reactions\n{'='*70}") + + best: dict[tuple, float] = {} + for ko, gene, e in zip(hits["ko"], hits["gene"], hits["evalue"], strict=True): + key = (gene.split(":", 1)[1], ko) + if key not in best or e < best[key]: + best[key] = e + matched = np.array([e for k, e in best.items() if k in gt_pairs]) + novel = np.array([e for k, e in best.items() if k not in gt_pairs]) + + def logq(arr, q): + if not len(arr): + return float("nan") + v = np.quantile(arr, q) + return np.log10(v) if v > 0 else -300.0 + + print("\nlog10(E-value) percentiles [matched=in annotation, novel=not]:") + print(f" {'group':<8}{'n':>7}{'p50':>8}{'p90':>8}{'p95':>8}{'p99':>8}") + for name, arr in (("matched", matched), ("novel", novel)): + print(f" {name:<8}{len(arr):>7}{logq(arr,.5):>8.0f}{logq(arr,.9):>8.0f}" + f"{logq(arr,.95):>8.0f}{logq(arr,.99):>8.0f}") + + print("\ncutoff sweep (min_score_ratio_ko=0.3, min_score_ratio_g=0.8):") + print(f" {'cutoff':>8}{'gKO_prec':>9}{'gKO_rec':>8}{'gKO_F1':>8}{'rxn_rec':>9}{'rxn_novel':>10}") + for cutoff in CUTOFFS: + pred 
= predicted_pairs(hits, cutoff=cutoff) + prec, rec, f1 = prf(pred, gt_pairs) + pred_rxns = {r for _, ko in pred for r in ko2rxn.get(ko, ())} + rrec = len(pred_rxns & gt_rxns) / len(gt_rxns) if gt_rxns else 0.0 + print(f" {cutoff:>8.0e}{prec:>9.2f}{rec:>8.2f}{f1:>8.2f}{rrec:>9.2f}" + f"{len(pred_rxns - gt_rxns):>10}") + + print("\nratio sweep (cutoff=1e-50):") + print(f" {'ko_ratio':>9}{'g_ratio':>8}{'gKO_prec':>9}{'gKO_rec':>8}{'gKO_F1':>8}") + for rko in KO_RATIOS: + for rg in G_RATIOS: + pred = predicted_pairs(hits, cutoff=1e-50, + min_score_ratio_ko=rko, min_score_ratio_g=rg) + prec, rec, f1 = prf(pred, gt_pairs) + print(f" {rko:>9.1f}{rg:>8.2f}{prec:>9.2f}{rec:>8.2f}{f1:>8.2f}") + + +if __name__ == "__main__": + main() diff --git a/scripts/analyze_init_params.py b/scripts/analyze_init_params.py new file mode 100644 index 0000000..1059006 --- /dev/null +++ b/scripts/analyze_init_params.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +"""Parameter calibration for (f)tINIT — intrinsic speed/quality sweeps (Phase 4d.7). + +Genome-scale benchmark that sweeps the MILP/conditioning parameters of raven_python's +:func:`raven_python.init.run_ftinit`, :func:`raven_python.init.ftinit`, :func:`run_init`, and +:func:`prep_init_model` and records, for each value, the *intrinsic* trade-off: wall-clock +solve time, the MILP objective, and how far the result drifts from the tightest-setting +("reference") run — both in objective (relative gap) and in the **kept-reaction set** +(Jaccard). No external (RAVEN) reference is used: the question answered here is "what is +the loosest / cheapest setting that still reproduces the tight-setting solution?". + +Why reaction-set drift matters: a MIP gap g only guarantees the *objective* is within g of +optimal; the *which-reactions* answer can jump between alternate optima well before the +objective moves. For a model-extraction tool the reaction set is the product, so we track +its stability explicitly. 
+ +Sweeps (select with ``--sweeps``; each is resumable — results are pickled per config and a +re-run skips finished ones): + +* ``ftinit_milp`` — single staged-MILP step (step 0 of series '1+1') on the merged model. + Cheap (~30-200 s each); the core sweep for ``mip_gap``/``big_m``/``force_on``. +* ``prep_scale`` — rescaleModelForINIT on/off and its ``max_stoich_diff``, fed into the + same step-0 MILP. Shows why scaling is needed for a fixed big-M. +* ``tinit`` — full ``get_init_model`` (classic INIT). Sweeps ``mip_gap``/``eps``/ + ``prod_weight``/``big_m``. Expensive — uses a tight ``time_limit``. +* ``ftinit_full`` — the whole ``ftinit`` pipeline (both steps + gap-fill). Sweeps + ``mip_gap``/``big_m``. Expensive (~200 s+/config). + +Usage +----- + python scripts/analyze_init_params.py \ + --work ~/hgem_compare --cell HCT116 --sweeps ftinit_milp,prep_scale + +``--work`` holds ``raven_refModel.xml`` and the Human-GEM-derived spont/custom inputs +(see the Human-GEM validation run). Requires a MILP solver (Gurobi/HiGHS) on the cobra +config. Produces a results pickle and prints a table per sweep; feed the tables into +docs/init_param_calibration.md. +""" +from __future__ import annotations + +import argparse +import pickle +import time +from dataclasses import dataclass, field +from pathlib import Path + +import cobra + +from raven_python.init import ( + ftinit, + gene_scores_from_expression, + get_init_model, + prep_init_model, + score_reactions_from_genes, +) +from raven_python.init.ftinit import run_ftinit +from raven_python.init.merge import group_rxn_scores +from raven_python.init.prep import rescale_for_init +from raven_python.init.steps import get_init_steps + +# Sweep grids (first value of each tolerance sweep is the tight "reference"). 
def _jaccard(a: set[str], b: set[str]) -> float:
    """Jaccard index ``|a & b| / |a | b|``; two empty sets count as identical (1.0)."""
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)
def _recommend(results: list[Result]) -> str:
    """Label of the cheapest non-reference config that reproduces the reference.

    ``results[0]`` is the sweep reference and is never recommended. A later
    config qualifies when it solved (``status == "ok"``), the absolute value of
    its relative objective gap is at most ``TOL_OBJ``, and its Jaccard overlap
    with the reference reaction set is at least ``TOL_JAC``. Among qualifying
    configs the one with the smallest wall-clock time wins.

    Returns ``"-"`` when no config qualifies.
    """
    def within_tolerance(r: Result) -> bool:
        # Test for None explicitly. A gap of exactly 0.0 (objective identical
        # to the reference) is the best possible outcome; a truthiness test
        # such as ``gap or 1`` would treat it as missing and reject it.
        if r.status != "ok" or r.rel_obj_gap is None or r.jaccard is None:
            return False
        return abs(r.rel_obj_gap) <= TOL_OBJ and r.jaccard >= TOL_JAC

    ok = [r for r in results[1:] if within_tolerance(r)]
    return min(ok, key=lambda r: r.seconds).label if ok else "-"
def sweep_ftinit_milp(prep, rxn_scores, store, save) -> list:
    """Sweep ``mip_gap``, ``big_m`` and ``force_on`` on the step-0 staged MILP.

    Each config is solved through ``_run_step0`` on ``prep.min_model`` and
    cached in ``store`` under ``("ftinit_milp", label)``; ``save()`` is called
    after every newly solved config, and configs already in ``store`` are not
    re-solved. The first config of each sweep is its reference for
    ``_finalize``.

    Returns the concatenated lines produced by ``_print_table`` for the three
    sweeps, in the order mip_gap, big_m, force_on.
    """
    step, scores = _step0(prep, rxn_scores)
    min_model = prep.min_model

    def run_cached(label, **solver_kw):
        key = ("ftinit_milp", label)
        if key in store:
            # Finished in an earlier (possibly interrupted) run: reuse it.
            return store[key]
        print(f"[ftinit_milp] {label} ...", flush=True)
        outcome = _run_step0(min_model, scores, prep, step, **solver_kw)
        outcome.label = label
        store[key] = outcome
        save()
        return store[key]

    tables: list[str] = []

    # Tolerance sweep: mip_gap varies, big_m=100 and force_on=0.1 held fixed.
    gap_runs = []
    for gap in MIP_GAPS:
        gap_runs.append(
            run_cached(f"gap={gap}", mip_gap=gap, big_m=100.0, force_on=0.1, time_limit=900)
        )
    _finalize(gap_runs)
    tables.extend(_print_table("ftINIT step-0: mip_gap (big_m=100, force_on=0.1)", gap_runs))

    # big_m sweep: gap=0.001 and force_on=0.1 held fixed.
    big_m_runs = []
    for big_m in BIG_MS:
        big_m_runs.append(
            run_cached(f"big_m={int(big_m)}", mip_gap=0.001, big_m=big_m, force_on=0.1,
                       time_limit=900)
        )
    _finalize(big_m_runs)
    big_m_note = "big_m caps a scored reaction's flux; large values weaken the LP relaxation."
    tables.extend(
        _print_table("ftINIT step-0: big_m (gap=0.001, force_on=0.1)", big_m_runs, big_m_note)
    )

    # force_on sweep: gap=0.001 and big_m=100 held fixed. This one alters the
    # model itself (connectivity threshold) rather than a solver tolerance.
    force_on_runs = []
    for force_on in FORCE_ONS:
        force_on_runs.append(
            run_cached(f"force_on={force_on}", mip_gap=0.001, big_m=100.0, force_on=force_on,
                       time_limit=900)
        )
    _finalize(force_on_runs)
    force_on_note = (
        "force_on changes the *model* (min flux to count as 'on'), not just tolerance — "
        "Jaccard here measures sensitivity, not error."
    )
    tables.extend(
        _print_table("ftINIT step-0: force_on (gap=0.001, big_m=100)", force_on_runs,
                     force_on_note)
    )
    return tables
as ex: # noqa: BLE001 + r = Result(label=label, seconds=time.time() - t, status=f"FAIL:{type(ex).__name__}", + objective=0.0, n_kept=0) + store[key] = r + save() + return store[key] + + tl = 400 # tight time limit so the sweep is affordable + res = [cfg(f"gap={g}", eps=1.0, prod_weight=0.5, mip_gap=g, time_limit=tl) for g in (0.001, 0.003, 0.01)] + _finalize(res) + doc += _print_table(f"tINIT: mip_gap (eps=1, prod_weight=0.5, time_limit={tl}s)", res) + + res = [cfg(f"eps={e}", eps=e, prod_weight=0.5, mip_gap=0.005, time_limit=tl) for e in EPS_VALS] + _finalize(res) + doc += _print_table("tINIT: eps (gap=0.005) — connectivity flux threshold (changes the model)", res) + + res = [cfg(f"prodw={p}", eps=1.0, prod_weight=p, mip_gap=0.005, time_limit=tl) for p in PROD_WEIGHTS] + _finalize(res) + doc += _print_table("tINIT: prod_weight (gap=0.005) — metabolite-production reward (changes the model)", res) + + res = [cfg("big_m=ub(None)", eps=1.0, prod_weight=0.5, mip_gap=0.005, time_limit=tl, big_m=None)] + res += [cfg(f"big_m={int(b)}", eps=1.0, prod_weight=0.5, mip_gap=0.005, time_limit=tl, big_m=b) + for b in (1000.0, 250.0, 100.0)] + _finalize(res) + doc += _print_table("tINIT: big_m (gap=0.005) — None=per-reaction ub (no rescale on tINIT)", res) + return doc + + +def sweep_ftinit_full(prep, rxn_scores, gene_scores, store, save) -> list: + doc: list[str] = [] + + def cfg(label, **kw): + key = ("ftinit_full", label) + if key not in store: + print(f"[ftinit_full] {label} ...", flush=True) + t = time.time() + try: + out = ftinit(prep, rxn_scores, gene_scores=gene_scores, series="1+1", **kw) + r = Result(label=label, seconds=time.time() - t, status="ok", + objective=0.0, n_kept=len(out.reactions), + reactions=sorted(x.id for x in out.reactions)) + except Exception as ex: # noqa: BLE001 + r = Result(label=label, seconds=time.time() - t, status=f"FAIL:{type(ex).__name__}", + objective=0.0, n_kept=0) + store[key] = r + save() + return store[key] + + res = [cfg(f"gap={g}", 
def main() -> None:
    """Command-line entry point: run the selected sweeps and report their tables.

    Parses the CLI options, loads the resumable results pickle if it exists,
    builds the shared inputs once, runs each sweep named in ``--sweeps`` (in the
    fixed order ftinit_milp, prep_scale, tinit, ftinit_full) and, when ``--doc``
    is given, writes the collected markdown lines to that file.

    Exits through ``argparse`` error handling when ``--sweeps`` names an
    unknown sweep, before any expensive model loading is done.
    """
    known_sweeps = ("ftinit_milp", "prep_scale", "tinit", "ftinit_full")

    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--work", type=Path, default=Path.home() / "hgem_compare")
    ap.add_argument("--human-gem", type=Path, default=Path.home() / "github" / "Human-GEM")
    ap.add_argument("--cell", default="HCT116")
    ap.add_argument("--sweeps", default="ftinit_milp,prep_scale,tinit,ftinit_full",
                    help="comma-separated subset of: ftinit_milp,prep_scale,tinit,ftinit_full")
    ap.add_argument("--out", type=Path, default=None, help="results pickle (resumable)")
    ap.add_argument("--doc", type=Path, default=None, help="write the markdown tables here")
    args = ap.parse_args()

    # Tolerate "a, b" style lists and reject typos up front: an unrecognised
    # name would otherwise be ignored silently after the slow input load.
    sweeps = {name.strip() for name in args.sweeps.split(",") if name.strip()}
    unknown = sorted(sweeps - set(known_sweeps))
    if unknown:
        ap.error(f"unknown sweep(s): {', '.join(unknown)} (choose from {', '.join(known_sweeps)})")

    out = args.out or args.work / f"init_param_sweep_{args.cell}.pkl"
    store: dict = {}
    if out.exists():
        # NOTE: unpickling executes arbitrary code — only point --out at a
        # results file produced by this script on a trusted machine.
        with open(out, "rb") as fh:
            store = pickle.load(fh)

    def save():
        # Write to a sibling ".part" file, close it (so the data is flushed),
        # and only then rename it over the previous results file.
        tmp = Path(f"{out}.part")
        with open(tmp, "wb") as fh:
            pickle.dump(store, fh)
        tmp.replace(out)

    t0 = time.time()
    ref, spont, custom, gene_scores, rxn_scores = _load_inputs(args.work, args.human_gem, args.cell)
    print(f"[{time.time()-t0:.0f}s] loaded {len(ref.reactions)} rxns, cell={args.cell}", flush=True)

    # The scaled prep is only needed by the two ftINIT sweeps.
    prep = None
    if sweeps & {"ftinit_milp", "ftinit_full"}:
        prep = prep_init_model(ref, ext_comp="e", spontaneous=spont, custom=custom, scale=True)
        print(f"[{time.time()-t0:.0f}s] scaled prep: min_model {len(prep.min_model.reactions)} rxns",
              flush=True)

    doc: list[str] = [f"# (f)tINIT parameter calibration — Human-GEM / {args.cell}", "",
                      "Generated by `scripts/analyze_init_params.py`. Reference (first) row of each "
                      "tolerance sweep is the tightest setting; gaps/Jaccard are measured against it.", ""]
    if "ftinit_milp" in sweeps:
        doc += sweep_ftinit_milp(prep, rxn_scores, store, save)
    if "prep_scale" in sweeps:
        doc += sweep_prep_scale(ref, spont, custom, rxn_scores, store, save)
    if "tinit" in sweeps:
        doc += sweep_tinit(ref, rxn_scores, store, save)
    if "ftinit_full" in sweeps:
        doc += sweep_ftinit_full(prep, rxn_scores, gene_scores, store, save)

    if args.doc:
        # The tables contain non-ASCII characters (—, ≤, ≥); pin the encoding
        # so the write does not depend on the platform default.
        args.doc.write_text("\n".join(doc) + "\n", encoding="utf-8")
        print(f"\nwrote {args.doc}", flush=True)
+ +Three independent degradations of the gene-expression vector (severity = higher is worse): + +* ``dropout`` — set a random fraction of genes to 0 (→ gene score -5, a strong *remove* + signal). Simulates shallow sequencing / single-cell dropout. +* ``noise`` — multiply each level by ``exp(N(0, sigma))`` (sigma = severity). +* ``downsample`` — drop a random fraction of genes entirely (→ ``no_gene_score``). + +Two phases: + +* **gradient** — task pipeline across degradation levels; shows functional integrity and + reaction-set drift vs the clean-data model. +* **levers** — at a fixed severe degradation, vary the robustness parameters + (``no_gene_score``, ``force_on``; ``prod_weight``/``eps`` for tINIT) to see which keeps + the model closest to the clean-data result / most functional. + +``--algo ftinit`` (default) or ``tinit``. Resumable; reuses the cached Human-GEM task prep +(``rg_prep_tasks.pkl``). Loose MIP gap for speed (functionality + set overlap, not the +exact optimum, are the metrics). + +Usage +----- + python scripts/analyze_init_robustness.py --algo ftinit --cell HCT116 +""" +from __future__ import annotations + +import argparse +import pickle +import time +from dataclasses import dataclass, field +from pathlib import Path + +import cobra +import numpy as np + +from raven_python.init import ( + ftinit, + gene_scores_from_expression, + get_init_model, + score_reactions_from_genes, +) +from raven_python.tasks import check_tasks, parse_task_list + +# Degradation grid (severity per kind). A mild and a severe point per kind. +GRADIENT = { + "dropout": (0.5, 0.7), # moderate + severe-but-realistic (single-cell dropout ~50-70%); + "noise": (1.0, 2.0), # 90%+ dropout breaks ~all tasks so gap-fill rebuilds the model + "downsample": (0.5, 0.7), # (a per-task MILP each) — pathologically slow and unrealistic. 
def degrade(expr: dict[str, float], kind: str, level: float, seed: int) -> dict[str, float]:
    """Return a degraded copy of the expression dict (severity ``level``).

    ``expr`` itself is never modified. A ``level`` of 0 or less returns a plain copy.

    Kinds:

    * ``"dropout"`` -- ``int(level * n)`` randomly chosen genes are set to ``0.0``.
    * ``"noise"`` -- every value is multiplied by ``exp(N(0, level))``.
    * ``"downsample"`` -- only ``int((1 - level) * n)`` randomly chosen genes are kept.

    Random draws come from ``np.random.default_rng(seed)``, so a given
    ``(expr, kind, level, seed)`` always yields the same result.

    Raises:
        ValueError: for an unknown ``kind``, or when ``level`` exceeds 1 for the two
            fraction-based kinds (``dropout`` / ``downsample``).
    """
    if level <= 0:
        return dict(expr)
    if kind in ("dropout", "downsample") and level > 1:
        # Without this check numpy fails later with an unrelated-looking message
        # (oversampling / negative size).
        raise ValueError(f"{kind} level is a fraction of genes and must be <= 1, got {level!r}")
    rng = np.random.default_rng(seed)
    genes = list(expr)
    if kind == "dropout":
        out = dict(expr)
        for g in rng.choice(genes, size=int(level * len(genes)), replace=False):
            out[g] = 0.0
        return out
    if kind == "noise":
        return {g: max(v * float(np.exp(rng.normal(0.0, level))), 0.0) for g, v in expr.items()}
    if kind == "downsample":
        keep = set(rng.choice(genes, size=int((1 - level) * len(genes)), replace=False))
        return {g: v for g, v in expr.items() if g in keep}
    raise ValueError(f"unknown degradation kind {kind!r}")
Result(label, time.time() - t, "ok", len(rset), n_pass, n_tasks, + n_pass / n_tasks if n_tasks else 0.0, rset) + if clean_set is not None: + r.jaccard_clean = _jaccard(set(rset), clean_set) + except Exception as ex: # noqa: BLE001 (infeasible/failed build is itself a finding) + msg = str(ex)[:80].replace("\n", " ") or type(ex).__name__ + print(f" FAIL {label}: {type(ex).__name__}: {ex}", flush=True) + r = Result(label, time.time() - t, f"FAIL:{msg}", 0, 0, len(tasks), 0.0) + return r + + +def _table(title, results, note="") -> list[str]: + lines = [f"### {title}", ""] + if note: + lines += [note, ""] + lines.append("| config | time (s) | status | n_rxns | tasks passed | frac | Jaccard vs clean |") + lines.append("|---|--:|---|--:|--:|--:|--:|") + for r in results: + jac = f"{r.jaccard_clean:.3f}" if r.jaccard_clean is not None else "ref" + lines.append(f"| {r.label} | {r.seconds:.0f} | {r.status} | {r.n_rxns} | " + f"{r.n_pass}/{r.n_tasks} | {r.frac_pass:.3f} | {jac} |") + lines.append("") + for ln in lines: + print(ln) + return lines + + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("--work", type=Path, default=Path.home() / "hgem_compare") + ap.add_argument("--human-gem", type=Path, default=Path.home() / "github" / "Human-GEM") + ap.add_argument("--cell", default="HCT116") + ap.add_argument("--algo", choices=("ftinit", "tinit"), default="ftinit") + ap.add_argument("--phase", default="gradient,levers") + ap.add_argument("--seed", type=int, default=0) + ap.add_argument("--out", type=Path, default=None) + ap.add_argument("--doc", type=Path, default=None) + args = ap.parse_args() + + out = args.out or args.work / f"init_robustness_{args.algo}_{args.cell}.pkl" + store: dict = pickle.load(open(out, "rb")) if out.exists() else {} + + def save(): + tmp = Path(f"{out}.part") + pickle.dump(store, open(tmp, "wb")) + tmp.replace(out) + + def cached(key, fn): + if key not in 
store: + print(f"[{args.algo}] {key[1]} ...", flush=True) + store[key] = fn() + save() + return store[key] + + t0 = time.time() + ref = cobra.io.read_sbml_model(str(args.work / "raven_refModel.xml")) + ref.solver = cobra.Configuration().solver + expr: dict[str, float] = {} + with open(args.human_gem / "data" / "datasets" / "Hart2015_RNAseq.txt") as f: + h = f.readline().rstrip("\n").split("\t") + c = h.index(args.cell) + for line in f: + p = line.rstrip("\n").split("\t") + expr[p[0]] = float(p[c]) + tasks = parse_task_list(str(args.human_gem / "data" / "metabolicTasks" / + "metabolicTasks_Essential.txt")) + prep = pickle.load(open(args.work / "rg_prep_tasks.pkl", "rb")) # ftINIT uses task layer + task_layer_note = ("task layer always on" if args.algo == "ftinit" + else "essential_rxns=[] (tINIT lb=eps incompatible with many essentials)") + print(f"[{time.time()-t0:.0f}s] ref {len(ref.reactions)} rxns, {len(tasks)} tasks, " + f"cell={args.cell}, algo={args.algo} ({task_layer_note})", flush=True) + + def model_for(e, **kw): + g = gene_scores_from_expression(e, 1.0) + r = score_reactions_from_genes(ref, g, no_gene_score=kw.get("no_gene_score", -2.0)) + if args.algo == "ftinit": + return ftinit(prep, r, gene_scores=g, series="1+1", + force_on=kw.get("force_on", 0.1), mip_gap=MIP_GAP, time_limit=TIME_LIMIT) + # tINIT's essential_rxns are forced via lb=eps; >100 essentials simultaneously is + # infeasible at genome scale regardless of eps (see docs/init_param_calibration.md + # §1.5). tINIT is therefore run *without* essentials here — the realistic + # tINIT-without-gap-fill picture. Use a small default eps (0.1) all the same to + # avoid the unrelated connectivity-threshold over-constraint. 
+ return get_init_model(ref, rxn_scores=r, essential_rxns=[], + prod_weight=kw.get("prod_weight", 0.5), eps=kw.get("eps", 0.1), + mip_gap=MIP_GAP, time_limit=TIME_LIMIT).model + + phases = set(args.phase.split(",")) + doc = [f"# (f)tINIT robustness to degraded transcriptomics — Human-GEM / {args.cell} / {args.algo}", + "", "Task + gap-fill layer is always on (it is part of the pipeline); the variable is the " + "expression input. Functional = fraction of essential tasks performed (check_tasks); " + "Jaccard is reaction-set overlap with the clean-data model. Generated by " + "`scripts/analyze_init_robustness.py`.", ""] + + clean = cached(("clean", "clean"), lambda: _measure("clean", lambda: model_for(expr), tasks)) + clean_set = set(clean.reactions) + clean.jaccard_clean = None # it is the reference + doc += _table("Clean-data baseline", [clean]) + + if "gradient" in phases: + for kind, levels in GRADIENT.items(): + rows = [clean] + for lvl in levels: + e = degrade(expr, kind, lvl, args.seed) + rows.append(cached((f"grad_{kind}", f"{kind}={lvl}"), lambda e=e, lvl=lvl, kind=kind: + _measure(f"{kind}={lvl}", lambda: model_for(e), tasks, clean_set))) + doc += _table(f"Gradient: {kind} (task pipeline always on)", rows, + "Higher severity = noisier/sparser input. 
frac should stay ~1.0 (the task " + "layer's job); the Jaccard drop is how much degraded data changes the model.") + + if "levers" in phases: + e = degrade(expr, LEVER_KIND, LEVER_LEVEL, args.seed) + tag = f"{LEVER_KIND}={LEVER_LEVEL}" + rows = [] + if args.algo == "ftinit": + for ngs in NO_GENE_SCORES: + rows.append(cached(("lever", f"no_gene_score={ngs}"), lambda ngs=ngs: + _measure(f"no_gene_score={ngs}", lambda: model_for(e, no_gene_score=ngs), + tasks, clean_set))) + for fo in FORCE_ONS: + rows.append(cached(("lever", f"force_on={fo}"), lambda fo=fo: + _measure(f"force_on={fo}", lambda: model_for(e, force_on=fo), + tasks, clean_set))) + else: + for pw in PROD_WEIGHTS: + rows.append(cached(("lever", f"prod_weight={pw}"), lambda pw=pw: + _measure(f"prod_weight={pw}", lambda: model_for(e, prod_weight=pw), + tasks, clean_set))) + for ev in EPS_VALS: + rows.append(cached(("lever", f"eps={ev}"), lambda ev=ev: + _measure(f"eps={ev}", lambda: model_for(e, eps=ev), tasks, clean_set))) + doc += _table(f"Levers at {tag}: which parameter keeps the model closest to clean?", rows, + "Compare against the default-parameter row for this severity in the gradient " + "table above (no_gene_score=-2, force_on=0.1 / prod_weight=0.5, eps=1.0).") + + if args.doc: + args.doc.write_text("\n".join(doc) + "\n") + print(f"\nwrote {args.doc}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/scripts/analyze_init_solvers.py b/scripts/analyze_init_solvers.py new file mode 100644 index 0000000..e1e7b6c --- /dev/null +++ b/scripts/analyze_init_solvers.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +"""Cross-solver benchmark for ftINIT on a genome-scale model (Phase 4d.7). + +The clean-data calibration and robustness studies tuned (and ran) on Gurobi. 
The CI +``tests/test_init_solvers.py`` checks correctness on toy models for every installed MILP +solver; this script measures **genome-scale tractability and reaction-set agreement** — +does the same ftINIT pipeline that works in seconds on Gurobi also complete on HiGHS or +GLPK, in what time, and producing the same model? + +For each installed MILP-capable optlang interface (Gurobi, ``hybrid`` for HiGHS, GLPK) it +runs the *same* ftINIT call (cached Human-GEM no-task prep + HCT116 scores) with the same +``mip_gap``/``time_limit``, records (status, wall time, reaction set), and computes the +pairwise Jaccard of the resulting reaction sets. Solvers that fail (the optlang +``hybrid_interface`` ``clone`` bug, or GLPK timing out at genome scale) are recorded as +such — that *is* the cross-solver picture. + +Usage +----- + python scripts/analyze_init_solvers.py --cell HCT116 --time-limit 900 \ + --doc docs/init_solver_benchmark.md +""" +from __future__ import annotations + +import argparse +import importlib.util +import pickle +import time +from dataclasses import dataclass, field +from pathlib import Path + +import cobra + +from raven_python.init import ftinit, gene_scores_from_expression, score_reactions_from_genes + +_INTERFACES = {"gurobi": "gurobi_interface", "hybrid": "hybrid_interface", "glpk": "glpk_interface"} + + +def _available_solvers() -> list[str]: + return [name for name, mod in _INTERFACES.items() + if importlib.util.find_spec(f"optlang.{mod}") is not None] + + +@dataclass +class Result: + solver: str + seconds: float + status: str + n_rxns: int + reactions: list[str] = field(default_factory=list) + + +def _jaccard(a: set[str], b: set[str]) -> float: + return len(a & b) / len(a | b) if (a or b) else 1.0 + + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("--work", type=Path, default=Path.home() / "hgem_compare") + ap.add_argument("--human-gem", type=Path, 
default=Path.home() / "github" / "Human-GEM") + ap.add_argument("--cell", default="HCT116") + ap.add_argument("--mip-gap", type=float, default=0.001) + ap.add_argument("--time-limit", type=float, default=900.0) + ap.add_argument("--out", type=Path, default=None) + ap.add_argument("--doc", type=Path, default=None) + args = ap.parse_args() + + out = args.out or args.work / f"init_solver_bench_{args.cell}.pkl" + store: dict = pickle.load(open(out, "rb")) if out.exists() else {} + + def save(): + tmp = Path(f"{out}.part") + pickle.dump(store, open(tmp, "wb")) + tmp.replace(out) + + expr: dict[str, float] = {} + with open(args.human_gem / "data" / "datasets" / "Hart2015_RNAseq.txt") as f: + h = f.readline().rstrip("\n").split("\t") + c = h.index(args.cell) + for line in f: + p = line.rstrip("\n").split("\t") + expr[p[0]] = float(p[c]) + if not (args.work / "rg_prep.pkl").exists(): + raise SystemExit(f"missing prep at {args.work / 'rg_prep.pkl'} — run the validation first") + + solvers = _available_solvers() + print(f"available MILP solvers: {solvers}", flush=True) + + def run(solver: str) -> Result: + if solver in store: + print(f"[{solver}] cached, skip", flush=True) + return store[solver] + print(f"[{solver}] running ...", flush=True) + t = time.time() + try: + # Fresh ref + prep load per solver so a broken interface (e.g. the optlang + # hybrid_interface clone bug at .solver=) doesn't pollute the next solver's state. 
+ ref = cobra.io.read_sbml_model(str(args.work / "raven_refModel.xml")) + ref.solver = solver + local_prep = pickle.load(open(args.work / "rg_prep.pkl", "rb")) + local_prep.min_model.solver = solver + g = gene_scores_from_expression(expr, 1.0) + r = score_reactions_from_genes(ref, g) + model = ftinit(local_prep, r, gene_scores=g, series="1+1", + mip_gap=args.mip_gap, time_limit=args.time_limit) + rset = sorted(x.id for x in model.reactions) + res = Result(solver, time.time() - t, "ok", len(rset), rset) + except Exception as ex: # noqa: BLE001 - failure mode is the finding + res = Result(solver, time.time() - t, + f"FAIL:{type(ex).__name__}: {str(ex)[:80]}", 0, []) + store[solver] = res + save() + return res + + results: dict[str, Result] = {s: run(s) for s in solvers} + + # Reporting. + lines = [f"# Cross-solver ftINIT benchmark — Human-GEM / {args.cell}", "", + f"Same `ftinit()` call (no-task scaled prep; `mip_gap={args.mip_gap}`, " + f"`time_limit={args.time_limit}s`) run with each installed MILP-capable " + f"optlang interface. 
Generated by `scripts/analyze_init_solvers.py`.", "", + "## Per-solver result", "", + "| solver | time (s) | status | n_rxns |", + "|--------|---------:|--------|-------:|"] + for s, r in results.items(): + lines.append(f"| {s} | {r.seconds:.0f} | {r.status} | {r.n_rxns} |") + lines.append("") + + ok = {s: r for s, r in results.items() if r.status == "ok" and r.reactions} + if len(ok) >= 2: + lines += ["## Reaction-set agreement (Jaccard)", "", + "| solvers | shared | only A | only B | Jaccard |", + "|---------|-------:|-------:|-------:|--------:|"] + names = sorted(ok) + for i, a in enumerate(names): + for b in names[i + 1:]: + sa, sb = set(ok[a].reactions), set(ok[b].reactions) + lines.append(f"| {a} vs {b} | {len(sa & sb)} | {len(sa - sb)} | " + f"{len(sb - sa)} | {_jaccard(sa, sb):.3f} |") + lines.append("") + + text = "\n".join(lines) + "\n" + print(text) + if args.doc: + args.doc.write_text(text) + print(f"wrote {args.doc}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/scripts/benchmark_localization_yeast.py b/scripts/benchmark_localization_yeast.py new file mode 100644 index 0000000..31c2076 --- /dev/null +++ b/scripts/benchmark_localization_yeast.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +"""Benchmark :func:`raven_python.localization.predict_localization` on yeast-GEM. + +Treats yeast-GEM's curated compartmentalisation as ground truth, flattens the model with +:func:`merge_compartments` to a single compartment (so the algorithm cannot lean on +metabolite-topology evidence), then asks ``predict_localization`` to place every +GPR-annotated reaction back into a compartment given a per-gene score table. + +The reference score table is derived directly from yeast-GEM (each gene scores 1.0 in +the compartments where its reactions actually live). Noise can be added — a configurable +fraction of genes have a random other compartment swapped in as the best score — to see +how the algorithm degrades with imperfect predictor evidence. 
def build_truth(model: cobra.Model) -> dict[str, str]:
    """Map every single-compartment, gene-associated reaction to its compartment.

    A reaction is left out when it is a boundary reaction, has no genes, or when its
    metabolites do not resolve to exactly one compartment (several compartments, or
    none at all). Metabolites with an empty compartment are ignored when counting.
    """
    placed: dict[str, str] = {}
    for rxn in model.reactions:
        if not rxn.boundary and rxn.genes:
            homes = {met.compartment for met in rxn.metabolites if met.compartment}
            if len(homes) == 1:
                # Unpack the only element of the one-item set.
                (placed[rxn.id],) = homes
    return placed
def add_noise(scores: LocalizationScores, fraction: float, seed: int) -> LocalizationScores:
    """For ``fraction`` of genes, replace their score row with a single 1.0 in a random
    *wrong* compartment (everything else 0). Simulates "predictor is confidently wrong".

    "Wrong" means any compartment that is not among the gene's top-scoring ones. When
    several compartments tie for the top score, all of them are excluded, so the noise
    never lands on a compartment the table already ranks best. A gene whose row is all
    zeros treats the first column as its top, as before. Genes with no wrong
    compartment left (e.g. a one-column table) are skipped unchanged.

    ``fraction <= 0`` returns ``scores`` itself. Otherwise a new ``LocalizationScores``
    is built from a modified copy of ``scores.df``. The number of noised genes is
    ``round(fraction * n_genes)``, capped at ``n_genes``. Draws use
    ``np.random.default_rng(seed)``.
    """
    if fraction <= 0:
        return scores
    rng = np.random.default_rng(seed)
    df = scores.df.copy()
    compartments = list(df.columns)
    # Cap at the table size: sampling without replacement rejects oversampling.
    n_to_noise = min(int(round(fraction * len(df))), len(df))
    targets = rng.choice(df.index, size=n_to_noise, replace=False)
    for g in targets:
        row = df.loc[g]
        best = row.max()
        if best > 0:
            # Every compartment tied for the best score counts as "true".
            true_tops = set(row.index[row == best])
        else:
            true_tops = {compartments[0]}
        candidates = [c for c in compartments if c not in true_tops]
        if not candidates:
            continue  # nothing wrong to pick from; leave the row as it is
        wrong = rng.choice(candidates)
        df.loc[g, :] = 0.0
        df.at[g, wrong] = 1.0
    return LocalizationScores(df)
+ + Flattens the model to a single compartment (using the curated default as the merged + id, so reactions truly *in* the default appear unmoved when correctly predicted), + runs ``predict_localization`` on every truth-set reaction, and returns metrics + + per-reaction predictions. + """ + flat, _, _ = merge_compartments( + model_orig, merged_id=default_compartment, merged_name=default_compartment, + drop_single_metabolite_reactions=False, deduplicate_reactions=False, + ) + # The flattened model may have lost some reactions if their net stoichiometry + # cancelled after the merge — restrict the truth set to surviving reactions. + surviving = {r.id for r in flat.reactions} + relevant = {rid: c for rid, c in truth.items() if rid in surviving} + + t = time.time() + proposal = predict_localization( + flat, scores, list(relevant), + default_compartment=default_compartment, + transport_cost=transport_cost, + multi_compartment_penalty=multi_compartment_penalty, + apply=False, mip_gap=mip_gap, time_limit=time_limit, + ) + elapsed = time.time() - t + + # `moved` only lists reactions whose chosen compartment differs from the flattened + # `from_compartment` (i.e. `default_compartment`). Anything not in `moved` was + # placed in the default — record it as such. 
def confusion_matrix(predictions: dict[str, str], truth: dict[str, str]) -> pd.DataFrame:
    """Count how often each true compartment was predicted as each compartment.

    Only reactions present in ``predictions`` are counted; each must also be a key of
    ``truth``. Rows are the true labels, columns the predicted labels, both sorted.
    Combinations that never occur are filled with 0.
    """
    true_col = [truth[rid] for rid in predictions]
    pred_col = list(predictions.values())
    pairs = pd.DataFrame({"true": true_col, "predicted": pred_col})
    counts = pairs.groupby(["true", "predicted"]).size()
    matrix = counts.unstack(fill_value=0)
    matrix = matrix.sort_index()
    return matrix.sort_index(axis=1)
(keeps the " + "compartment distribution, drawn deterministically with --seed)") + ap.add_argument("--doc", type=Path, help="write a markdown summary here") + args = ap.parse_args() + + print(f"loading {args.yeast_gem} ...", flush=True) + model = cobra.io.read_sbml_model(str(args.yeast_gem)) + truth = build_truth(model) + print(f"yeast-GEM: {len(model.reactions)} reactions, {len(model.genes)} genes, " + f"{len(model.compartments)} compartments; truth set: {len(truth)} reactions", + flush=True) + if args.max_reactions and args.max_reactions < len(truth): + # Stratified subsample: keep the original compartment distribution. + rng = np.random.default_rng(args.seed) + by_comp: dict[str, list[str]] = {} + for rid, c in truth.items(): + by_comp.setdefault(c, []).append(rid) + keep: list[str] = [] + for rids in by_comp.values(): + n = max(1, round(args.max_reactions * len(rids) / len(truth))) + keep += list(rng.choice(rids, size=min(n, len(rids)), replace=False)) + truth = {rid: truth[rid] for rid in keep} + print(f"subsampled truth set to {len(truth)} reactions " + f"(--max-reactions={args.max_reactions})", flush=True) + + base_scores: LocalizationScores + if args.scores_csv: + print(f"loading scores from {args.scores_csv} ...", flush=True) + base_scores = load_csv_scores(args.scores_csv) + noise_levels = [0.0] # external scores: no synthetic noise sweep + else: + print("deriving reference scores from yeast-GEM ...", flush=True) + base_scores = derive_scores_from_model(model) + noise_levels = [float(x) for x in args.noise.split(",")] + + results: list[dict] = [] + for noise in noise_levels: + scores = add_noise(base_scores, noise, args.seed) if noise > 0 else base_scores + print(f"\n=== noise={noise:.2f} ({int(noise * len(base_scores.df))} genes " + f"confidently mis-scored) ===", flush=True) + r = run_one_test( + model, truth, scores, + default_compartment=args.default_compartment, + transport_cost=args.transport_cost, + 
multi_compartment_penalty=args.multi_compartment_penalty, + mip_gap=args.mip_gap, time_limit=args.time_limit, + ) + r["noise"] = noise + results.append(r) + print(f" solved in {r['seconds']:.0f}s — accuracy {r['n_correct']}/{r['n_total']} = " + f"{r['accuracy']:.3f} ({r['n_unplaced']} unplaced)", flush=True) + + # --- Reporting ------------------------------------------------------------- + lines: list[str] = [] + lines += ["# yeast-GEM localisation benchmark", "", + f"Model: `{args.yeast_gem.name}` — {len(model.reactions)} reactions, " + f"{len(model.genes)} genes, {len(model.compartments)} compartments. " + f"Truth set: {len(truth)} single-compartment GPR-annotated reactions. " + f"Default compartment for the merged model: `{args.default_compartment}`. " + f"`transport_cost={args.transport_cost}`, " + f"`multi_compartment_penalty={args.multi_compartment_penalty}`, " + f"`mip_gap={args.mip_gap}`, `time_limit={args.time_limit}s`.", "", + "## Accuracy vs. predictor noise", "", + "| noise | seconds | n_total | n_correct | n_unplaced | accuracy |", + "|------:|--------:|--------:|----------:|-----------:|---------:|"] + for r in results: + lines.append( + f"| {r['noise']:.2f} | {r['seconds']:.0f} | {r['n_total']} | " + f"{r['n_correct']} | {r['n_unplaced']} | {r['accuracy']:.3f} |" + ) + lines.append("") + + # Confusion matrix for the lowest-noise run (typically the most informative). + best = min(results, key=lambda x: x["noise"]) + cm = confusion_matrix(best["predictions"], best["truth"]) + lines += [f"## Confusion matrix at noise={best['noise']:.2f}", "", + "Rows = curated (true) compartment; columns = predicted.", ""] + lines.append("| true \\ pred | " + " | ".join(str(c) for c in cm.columns) + " |") + lines.append("|---" + "|---" * len(cm.columns) + "|") + for true_c, row in cm.iterrows(): + lines.append(f"| **{true_c}** | " + " | ".join(str(int(v)) for v in row) + " |") + lines.append("") + + # Per-compartment accuracy at the lowest-noise run. 
+ per_comp: dict[str, tuple[int, int]] = {} + for rid, true_c in best["truth"].items(): + n_true, n_correct = per_comp.get(true_c, (0, 0)) + per_comp[true_c] = (n_true + 1, n_correct + (best["predictions"][rid] == true_c)) + lines += [f"## Per-compartment accuracy at noise={best['noise']:.2f}", "", + "| compartment | n | n_correct | accuracy |", + "|---|--:|--:|--:|"] + for c in sorted(per_comp): + n, ok = per_comp[c] + lines.append(f"| {c} | {n} | {ok} | {ok / n:.3f} |") + lines.append("") + + text = "\n".join(lines) + "\n" + print("\n" + text) + if args.doc: + args.doc.write_text(text) + print(f"wrote {args.doc}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/scripts/build_kegg_artefacts.py b/scripts/build_kegg_artefacts.py new file mode 100644 index 0000000..13fd00e --- /dev/null +++ b/scripts/build_kegg_artefacts.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +"""Build the publishable KEGG artefact set for one release (maintainer-side). + +Runs the maintainer pipeline against an arranged KEGG dump (see +``download_kegg_dump`` / ``fetch_keggdb``): + +* 3b.2 — ``parse_kegg_dump`` → ``reference_model.yml.gz`` + the gzipped-TSV tables; +* 3b.3 — ``build_hmm_library`` per domain → a pressed ``.hmm`` (+ hmmpress + sidecars), named so :func:`raven_python.data.ensure_kegg_hmm_library` can fetch them. + +Everything lands in ``--out`` ready to upload as release assets; feed that +directory to ``scripts/make_registry_snippet.py data`` to emit the registry entry. + +Examples +-------- +Tables + reference model only (fast, no binaries):: + + python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts + +Full build incl. 
both HMM libraries (slow; needs HMMER/MAFFT/CD-HIT):: + + python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts \\ + --hmms --threads 8 +""" +from __future__ import annotations + +import argparse +import shutil +from pathlib import Path + +from raven_python.reconstruction.kegg import ( + build_hmm_library, + parse_kegg_dump, + read_kegg_table, +) + +# hmmpress sidecar extensions, alongside the .hmm. +_HMM_SIDECARS = (".h3f", ".h3i", ".h3m", ".h3p") + + +def _publish_library(work: dict, out_dir: Path, domain: str) -> Path: + """Copy a built ``library.hmm`` (+ sidecars) to ``out_dir/.hmm``.""" + library = work["library"] + if library is None: + raise SystemExit(f"No HMMs built for {domain!r}; nothing to publish.") + target = out_dir / f"{domain}.hmm" + shutil.copyfile(library, target) + for suffix in _HMM_SIDECARS: + sidecar = library.with_name(library.name + suffix) + if sidecar.exists(): + shutil.copyfile(sidecar, target.with_name(target.name + suffix)) + return target + + +def main(argv: list[str] | None = None) -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--keggdb", required=True, type=Path, help="arranged KEGG dump directory") + parser.add_argument("--out", required=True, type=Path, help="artefact output directory") + parser.add_argument("--hmms", action="store_true", help="also build the HMM libraries") + parser.add_argument( + "--domains", nargs="+", default=["prokaryotes", "eukaryotes"], help="HMM domains to build" + ) + parser.add_argument("--threads", type=int, default=1) + parser.add_argument("--seq-identity", type=float, default=0.9, help="CD-HIT identity (-1 skips)") + parser.add_argument( + "--parttree-residues", type=int, default=None, + help="total-residue budget above which MAFFT uses PartTree (default 1M, tuned " + "for ~7 GB RAM; raise on machines with more memory)", + ) + args = parser.parse_args(argv) + + args.out.mkdir(parents=True, 
exist_ok=True) + print(">>> Parsing KEGG dump (3b.2)...") + paths = parse_kegg_dump(args.keggdb, args.out) + for name, path in paths.items(): + print(f" {name}: {path}") + + if args.hmms: + ogk = read_kegg_table(paths["organism_gene_ko"]) + genes_pep = args.keggdb / "genes.pep" + taxonomy = args.keggdb / "taxonomy" + for domain in args.domains: + print(f">>> Building HMM library for {domain} (3b.3)...") + work = build_hmm_library( + ogk, genes_pep, taxonomy, args.out / f"_hmms-{domain}", + domain=domain, seq_identity=args.seq_identity, + parttree_residues=args.parttree_residues, threads=args.threads, + ) + published = _publish_library(work, args.out, domain) + print(f" {domain}: {published} ({len(work['hmms'])} profiles)") + + print(f"\n>>> Done. Upload the contents of {args.out} as release assets, then run:") + print(" python scripts/make_registry_snippet.py data --dataset kegg " + f"--version --dir {args.out} --base-url ") + + +if __name__ == "__main__": + main() diff --git a/scripts/make_registry_snippet.py b/scripts/make_registry_snippet.py new file mode 100644 index 0000000..3efa49e --- /dev/null +++ b/scripts/make_registry_snippet.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +"""Emit ready-to-paste registry entries for published artefacts / binary ZIPs. + +Computes the SHA256 of each file and prints the Python/JSON entry to merge into +``raven_python.data._DATA_REGISTRY`` (data artefacts) or ``raven_python.binaries._REGISTRY`` +(binary bundles). Run once per release, after uploading the files to the release. 
+ +Examples +-------- +Data artefacts (KEGG reference model + tables + HMM libraries) for one release:: + + python scripts/make_registry_snippet.py data \\ + --dataset kegg --version kegg116 --dir artefacts \\ + --base-url https://github.com/ORG/raven_python/releases/download/kegg-data-kegg116 + +Binary bundle (one ZIP per platform, named ``---.zip``):: + + python scripts/make_registry_snippet.py binary \\ + --bundle blast --version 2.16.0 --provides blastp makeblastdb --dir zips \\ + --base-url https://github.com/ORG/raven_python/releases/download/blast-2.16.0 + +The SHA256 helper is shared with the runtime resolvers (``raven_python.binaries``), so +published checksums always match what ``ensure_data`` / ``ensure_binary`` verify. +""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from raven_python.binaries import _sha256 + + +def _files_in(directory: Path) -> list[Path]: + """Regular, non-hidden files in ``directory``, sorted by name.""" + return sorted(p for p in directory.iterdir() if p.is_file() and not p.name.startswith(".")) + + +def data_entry(dataset: str, version: str, base_url: str, directory: Path) -> dict: + """Build the ``_DATA_REGISTRY[dataset]`` entry for every file in ``directory``.""" + base = base_url.rstrip("/") + files = { + p.name: {"url": f"{base}/{p.name}", "sha256": _sha256(p)} for p in _files_in(directory) + } + if not files: + raise SystemExit(f"No files found in {directory}") + return {"version": version, "files": files} + + +def binary_entry( + bundle: str, version: str, provides: list[str], base_url: str, directory: Path +) -> dict: + """Build the ``_REGISTRY[bundle]`` entry from ``---.zip``.""" + base = base_url.rstrip("/") + prefix = f"{bundle}-{version}-" + platforms = {} + for zip_path in directory.glob(f"{prefix}*.zip"): + platform = zip_path.name[len(prefix) : -len(".zip")] + platforms[platform] = {"url": f"{base}/{zip_path.name}", "sha256": _sha256(zip_path)} + if not 
platforms: + raise SystemExit(f"No {prefix}*.zip files found in {directory}") + return {"version": version, "provides": provides, "platforms": dict(sorted(platforms.items()))} + + +def render(key: str, entry: dict) -> str: + """Render ``{key: entry}`` as an indented JSON block (valid Python to paste).""" + return json.dumps({key: entry}, indent=4) + + +def main(argv: list[str] | None = None) -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + sub = parser.add_subparsers(dest="kind", required=True) + + d = sub.add_parser("data", help="data-artefact registry entry (raven_python.data)") + d.add_argument("--dataset", required=True, help="dataset key, e.g. 'kegg'") + d.add_argument("--version", required=True) + d.add_argument("--dir", required=True, type=Path, help="directory of uploaded artefacts") + d.add_argument("--base-url", required=True, help="release download URL prefix") + + b = sub.add_parser("binary", help="binary-bundle registry entry (raven_python.binaries)") + b.add_argument("--bundle", required=True, help="bundle key, e.g. 
'blast'") + b.add_argument("--version", required=True) + b.add_argument("--provides", nargs="+", required=True, help="executables the bundle provides") + b.add_argument("--dir", required=True, type=Path, help="directory of uploaded ZIPs") + b.add_argument("--base-url", required=True, help="release download URL prefix") + + args = parser.parse_args(argv) + if args.kind == "data": + key, entry = args.dataset, data_entry(args.dataset, args.version, args.base_url, args.dir) + target = "raven_python/data.py _DATA_REGISTRY" + else: + key = args.bundle + entry = binary_entry(args.bundle, args.version, args.provides, args.base_url, args.dir) + target = "raven_python/binaries.py _REGISTRY" + + print(f"# Merge into {target}:", file=sys.stderr) + print(render(key, entry)) + + +if __name__ == "__main__": + main() diff --git a/src/raven_python/__init__.py b/src/raven_python/__init__.py new file mode 100644 index 0000000..591c4c1 --- /dev/null +++ b/src/raven_python/__init__.py @@ -0,0 +1,10 @@ +"""raven_python — Python counterpart of the RAVEN Toolbox, built on cobrapy. + +raven_python reuses cobrapy for simulation, standard analyses, SBML I/O, and model +manipulation, and provides the RAVEN-specific functionality on top: de novo +reconstruction (KEGG / homology), context-specific modeling (tINIT / ftINIT), +metabolic task validation, connectivity gap-filling, omics integration (HPA), +sub-cellular localisation, N-model comparison, and the RAVEN-style I/O formats. +""" + +__version__ = "0.0.1" diff --git a/src/raven_python/analysis/__init__.py b/src/raven_python/analysis/__init__.py new file mode 100644 index 0000000..d85afef --- /dev/null +++ b/src/raven_python/analysis/__init__.py @@ -0,0 +1,23 @@ +"""Analyses not in cobrapy's core. + +* :func:`reporter_metabolites` — Reporter Metabolites (around-metabolite gene-score test). +* :func:`fseof` — Flux Scanning based on Enforced Objective Flux. +* :func:`random_sampling` — random-objective flux sampling. 
+""" +from raven_python.analysis.fseof import FSEOFResult, fseof +from raven_python.analysis.reporter import ReporterResult, reporter_metabolites +from raven_python.analysis.sampling import ( + RandomSamplingResult, + find_good_reactions, + random_sampling, +) + +__all__ = [ + "FSEOFResult", + "RandomSamplingResult", + "ReporterResult", + "find_good_reactions", + "fseof", + "random_sampling", + "reporter_metabolites", +] diff --git a/src/raven_python/analysis/fseof.py b/src/raven_python/analysis/fseof.py new file mode 100644 index 0000000..c5b3ee2 --- /dev/null +++ b/src/raven_python/analysis/fseof.py @@ -0,0 +1,161 @@ +"""Flux Scanning based on Enforced Objective Flux — FSEOF (port + redesign). + +FSEOF (Choi et al., Appl Environ Microbiol 2010) finds metabolic-engineering targets +for over-producing a metabolite: enforce an increasing flux toward the target product +while optimising growth, and watch how each reaction's flux responds. This is a port +of RAVEN's ``FSEOF`` with a substantially richer, more robust output (RAVEN's +weaknesses are noted in IMPROVEMENTS, FS1–FS4): + +* **Robust trend, not strict monotonicity.** Each reaction's flux is regressed against + the enforced product flux across the scan; the **slope** is the response and the + **correlation** (|r|) is a quality score. A reaction is a target if it tracks the + product cleanly (|r| ≥ ``correlation_threshold``) — one noisy step from LP + alternative optima no longer discards it (and pFBA per step keeps the scan stable). +* **Direction classification RAVEN lacks.** Targets are labelled ``amplify`` (|flux| + rises with the product → over-express), ``knockdown`` (|flux| falls), or ``knockout`` + (|flux| → ~0 → delete). RAVEN only ever reports the amplification targets. +* **Gene-level view** via :attr:`FSEOFResult.gene_targets`, and the full flux scan is + retained in :attr:`FSEOFResult.scan` — all as DataFrames, not a printed TSV. 
+""" +from __future__ import annotations + +from dataclasses import dataclass + +import cobra +import numpy as np +import pandas as pd +from cobra.exceptions import OptimizationError +from cobra.flux_analysis import pfba +from scipy.stats import linregress + + +@dataclass +class FSEOFResult: + """FSEOF output. + + ``scan`` is reactions × enforced-flux-levels (the full flux scan); ``enforced`` are + the enforced target fluxes; ``targets`` is the classified per-reaction table + (sorted by score). :attr:`gene_targets` aggregates targets to genes. + """ + + scan: pd.DataFrame + enforced: list[float] + targets: pd.DataFrame + + @property + def amplification(self) -> pd.DataFrame: + return self.targets[self.targets["target_type"] == "amplify"].reset_index(drop=True) + + @property + def knockout(self) -> pd.DataFrame: + mask = self.targets["target_type"].isin(["knockout", "knockdown"]) + return self.targets[mask].reset_index(drop=True) + + @property + def gene_targets(self) -> pd.DataFrame: + """Per-gene aggregation: the target reactions each gene is associated with.""" + rows = [] + for _, t in self.targets.iterrows(): + for gene in t["genes"]: + rows.append({"gene": gene, "reaction": t["reaction"], + "target_type": t["target_type"], "slope": t["slope"]}) + if not rows: + return pd.DataFrame(columns=["gene", "target_type", "reactions", "max_abs_slope"]) + df = pd.DataFrame(rows) + agg = df.groupby("gene").agg( + target_type=("target_type", lambda s: ";".join(sorted(set(s)))), + reactions=("reaction", lambda s: ";".join(sorted(set(s)))), + max_abs_slope=("slope", lambda s: float(np.max(np.abs(s)))), + ).reset_index() + return agg.sort_values("max_abs_slope", ascending=False, ignore_index=True) + + +def fseof( + model: cobra.Model, + target_rxn: str, + *, + biomass_rxn: str | None = None, + n_steps: int = 10, + max_fraction: float = 0.9, + correlation_threshold: float = 0.9, + flux_eps: float = 1e-6, +) -> FSEOFResult: + """Run FSEOF for over-production of ``target_rxn``'s 
product. + + Enforces target flux from ``max_fraction/n_steps`` up to ``max_fraction`` of the + theoretical maximum in ``n_steps`` steps, maximising growth (``biomass_rxn`` or the + model's current objective) with pFBA at each step. Returns an :class:`FSEOFResult`. + """ + with model: # find the theoretical maximum target flux + model.objective = target_rxn + target_opt = model.slim_optimize() + # slim_optimize returns NaN on an infeasible model; np.isfinite catches that too. + if target_opt is None or not np.isfinite(target_opt) or target_opt <= flux_eps: + raise ValueError(f"{target_rxn!r} cannot carry positive flux; nothing to scan.") + target_max = target_opt * max_fraction + levels = [target_max * (i + 1) / n_steps for i in range(n_steps)] + + columns: dict[float, pd.Series] = {} + enforced: list[float] = [] + for level in levels: + with model: + if biomass_rxn is not None: + model.objective = biomass_rxn + model.reactions.get_by_id(target_rxn).lower_bound = level + try: + columns[level] = pfba(model).fluxes + except OptimizationError: + break # enforced flux became infeasible — stop scanning + enforced.append(level) + if len(enforced) < 2: + raise RuntimeError("FSEOF needs at least two feasible enforced-flux levels.") + + scan = pd.DataFrame(columns) + targets = _classify(model, scan, np.asarray(enforced), correlation_threshold, flux_eps) + return FSEOFResult(scan=scan, enforced=enforced, targets=targets) + + +def _classify(model, scan, enforced, corr_threshold, flux_eps) -> pd.DataFrame: + rows = [] + for rxn in model.reactions: + flux = scan.loc[rxn.id, enforced.tolist() if hasattr(enforced, "tolist") else enforced] + flux = flux.to_numpy(dtype=float) + initial, final = flux[0], flux[-1] + if flux.std() < flux_eps: # flat -> no response + continue + fit = linregress(enforced, flux) + slope, corr = float(fit.slope), float(fit.rvalue) + if abs(corr) < corr_threshold or abs(slope) < flux_eps: + continue + # Classify on the slope of |flux| vs the enforced 
product flux — the + # criterion the docstring states (|flux| rises = amplify, etc.). The + # old endpoint-only check (``abs(final) vs abs(initial)``) could + # mislabel a track whose first/last values straddled a peak/trough but + # whose overall trend was the opposite. Keep ``knockout`` for tracks + # the regression drives essentially to zero. + abs_fit = linregress(enforced, np.abs(flux)) + abs_slope = float(abs_fit.slope) + if abs(final) < flux_eps and abs_slope < 0: + ttype = "knockout" + elif abs_slope > 0: + ttype = "amplify" + else: + ttype = "knockdown" + rows.append({ + "reaction": rxn.id, + "name": rxn.name, + "subsystem": rxn.subsystem, + "gene_reaction_rule": rxn.gene_reaction_rule, + "genes": sorted(g.id for g in rxn.genes), + "target_type": ttype, + "slope": slope, + "correlation": corr, + "initial_flux": initial, + "final_flux": final, + "score": abs(slope) * abs(corr), + }) + table = pd.DataFrame(rows, columns=[ + "reaction", "name", "subsystem", "gene_reaction_rule", "genes", + "target_type", "slope", "correlation", "initial_flux", "final_flux", "score", + ]) + return table.sort_values("score", ascending=False, ignore_index=True) diff --git a/src/raven_python/analysis/reporter.py b/src/raven_python/analysis/reporter.py new file mode 100644 index 0000000..5d96d47 --- /dev/null +++ b/src/raven_python/analysis/reporter.py @@ -0,0 +1,117 @@ +"""Reporter Metabolites — metabolites around which transcriptional change concentrates. + +Patil & Nielsen, PNAS 2005. Each gene's differential-expression p-value becomes a +Z-score ``z = -Φ⁻¹(p)``; for every metabolite the Z-scores of the genes on its +neighbouring reactions are aggregated (``Σz / √n``), background-corrected, and turned +back into a p-value. 
+ +The background correction has an exact closed form (sampling with replacement from the +scored-gene pool: a random ``Σz/√n`` has mean ``√n·μ`` and standard deviation ``σ`` +with μ, σ the mean/std of the scored Z-scores), so the corrected score is just +``(metZ − √n·μ) / σ`` — no Monte-Carlo sampling needed. +""" +from __future__ import annotations + +import math +from collections.abc import Mapping +from dataclasses import dataclass + +import cobra +import numpy as np +import pandas as pd +from scipy.stats import norm + +_CLAMP = 15.0 # |Z| cap for p-values of exactly 0 or 1 (RAVEN's ±15) + + +@dataclass +class ReporterResult: + """Reporter-metabolite scores for one gene set. + + ``test`` is ``"all"``, ``"up"`` or ``"down"``; ``table`` is a DataFrame with + columns ``metabolite, name, z_score, p_value, n_genes, mean_z, std_z`` sorted by + descending ``z_score``. + """ + + test: str + table: pd.DataFrame + + +def _gene_z(pvalues: dict[str, float]) -> dict[str, float]: + genes = list(pvalues) + z = -norm.ppf([pvalues[g] for g in genes]) + z = np.where(np.isposinf(z), _CLAMP, z) + z = np.where(np.isneginf(z), -_CLAMP, z) + return dict(zip(genes, z, strict=True)) + + +def _reporter_one(model: cobra.Model, gene_z: dict[str, float], test: str) -> ReporterResult: + z_values = np.fromiter(gene_z.values(), dtype=float) + mu = float(z_values.mean()) if z_values.size else 0.0 + sigma = float(z_values.std(ddof=0)) if z_values.size else 0.0 + + rows = [] + for met in model.metabolites: + neighbours = {g.id for rxn in met.reactions for g in rxn.genes if g.id in gene_z} + if not neighbours: + continue + zs = np.array([gene_z[g] for g in neighbours]) + n = zs.size + raw = zs.sum() / math.sqrt(n) + # Exact background correction for sampling-with-replacement (see module doc). 
+ corrected = (raw - math.sqrt(n) * mu) / sigma if sigma > 0 else 0.0 + rows.append( + { + "metabolite": met.id, + "name": met.name or met.id, + "z_score": corrected, + "p_value": float(1.0 - norm.cdf(corrected)), + "n_genes": n, + "mean_z": float(zs.mean()), + "std_z": float(zs.std(ddof=1)) if n > 1 else float("nan"), + } + ) + table = pd.DataFrame(rows, columns=["metabolite", "name", "z_score", "p_value", "n_genes", "mean_z", "std_z"]) + table = table.sort_values("z_score", ascending=False, ignore_index=True) + return ReporterResult(test, table) + + +def reporter_metabolites( + model: cobra.Model, + gene_pvalues: Mapping[str, float], + *, + gene_fold_changes: Mapping[str, float] | None = None, +) -> list[ReporterResult]: + """Compute Reporter Metabolites from per-gene differential-expression p-values. + + ``gene_pvalues`` maps gene id → p-value (genes not in the model, or with a NaN or + out-of-``[0, 1]`` p-value, are dropped — a stray invalid p-value would otherwise + turn the whole result NaN). If ``gene_fold_changes`` (gene id → log fold change) + is given, two extra results are returned for the up- (fc ≥ 0) and down- (fc < 0) + regulated gene subsets, in addition to ``"all"``. + + Parity with RAVEN's ``reporterMetabolites``: the ``z_score`` and underlying + background correction match exactly (exact closed-form instead of RAVEN's + Monte-Carlo, see IMPROVEMENTS RM1). The reported ``p_value`` is the + *one-sided* (``"up"``) enrichment ``1 - Φ(z)`` and the result is sorted by + ``z_score`` descending. RAVEN sorts by p-value and reports both tails + (``allPValues``, ``allUpPValues``, ``allDownPValues``); the up/down splits + here come from the ``gene_fold_changes`` subset partition instead, so the + same information is available via the three returned ``ReporterResult`` + rows. 
+ """ + model_genes = {g.id for g in model.genes} + scored = { + g: float(p) + for g, p in gene_pvalues.items() + if g in model_genes and p is not None and not math.isnan(p) and 0.0 <= p <= 1.0 + } + gene_z = _gene_z(scored) + results = [_reporter_one(model, gene_z, "all")] + + if gene_fold_changes is not None: + up = {g: z for g, z in gene_z.items() if gene_fold_changes.get(g, 0.0) >= 0} + down = {g: z for g, z in gene_z.items() if gene_fold_changes.get(g, 0.0) < 0} + results.append(_reporter_one(model, up, "up")) + results.append(_reporter_one(model, down, "down")) + return results diff --git a/src/raven_python/analysis/sampling.py b/src/raven_python/analysis/sampling.py new file mode 100644 index 0000000..429b164 --- /dev/null +++ b/src/raven_python/analysis/sampling.py @@ -0,0 +1,207 @@ +"""Random-objective flux sampling — RAVEN's ``randomSampling`` (port + improvements). + +Samples the flux solution space by the **random-objective** method of Bordel et al. +(2010, PLoS Comput Biol, doi:10.1371/journal.pcbi.1000859), as ported from RAVEN's +``randomSampling``: each sample maximises a small random linear combination of +reactions, so every sample is an *extreme point* (vertex) of the flux polytope. + +This is a different statistical object from cobrapy's ``cobra.sampling`` (OptGP / +ACHR), which draw a (near-)uniform Markov-chain sample of the polytope *interior*. +Use cobra's samplers when you need the uniform flux distribution; use this when you +want a fast, robust spread of diverse optimal states — the workflow RAVEN uses to +compare conditions, and one that stays well-behaved on large or tightly-constrained +models where MCMC mixing is poor. cobrapy has no equivalent, so this is a genuine +addition, not a wrapper. + +Improvements over RAVEN (see IMPROVEMENTS SAMP1): + +* **`good_reactions` via one FVA pass**, not a hand-rolled per-reaction ``parfor`` + loop. 
A reaction is usable as a random objective if it can carry flux and is not + stuck in a stoichiometrically-infeasible loop (its range blows past the arbitrary + large bound). ``cobra``'s FVA computes exactly that, faster and in far less code, + and can optionally be made ``loopless``. +* **Reproducible** via ``seed`` (RAVEN has no seed control). +* **`n_objectives` is a parameter** (RAVEN hard-codes 2, though its docstring claims + 3). +* **Tidy output**: a ``samples`` DataFrame shaped samples × reactions (matching + ``cobra.sampling``), plus the reusable ``good_reactions`` list — instead of a + reactions × samples matrix and a parallel index vector. +""" +from __future__ import annotations + +import logging +from collections.abc import Iterable +from dataclasses import dataclass + +import cobra +import numpy as np +import pandas as pd +from cobra.flux_analysis import flux_variability_analysis, pfba + +logger = logging.getLogger(__name__) + + +@dataclass +class RandomSamplingResult: + """Output of :func:`random_sampling`. + + ``samples`` is a DataFrame of flux vectors shaped *n_samples × n_reactions* + (one sample per row, reaction ids as columns — the ``cobra.sampling`` layout). + ``good_reactions`` is the list of reaction ids that were eligible as random + objectives; pass it back in to skip the (one-off) FVA on a repeat run. + """ + + samples: pd.DataFrame + good_reactions: list[str] + + +def find_good_reactions( + model: cobra.Model, + *, + flux_tol: float = 1e-9, + loopless: bool = True, + exclude_reactions: Iterable[str] | None = None, +) -> list[str]: + """Reactions usable as random objectives: carry real (non-loop) flux. + + A reaction is kept if its FVA range spans more than ``flux_tol``. 
With + ``loopless`` (default) the FVA is loopless (``cycleFreeFlux``), so reactions + that can carry flux *only* through a thermodynamically-infeasible cycle have a + ~0 loopless range and are dropped — the right test for "loopy", unlike a fixed + bound threshold which wrongly drops legitimate reactions that simply reach the + model's default (e.g. 1000) bound. Pass ``loopless=False`` for a faster, looser + pass that keeps any flux-carrying reaction (loops included). + """ + fva = flux_variability_analysis( + model, fraction_of_optimum=0.0, + loopless="cycleFreeFlux" if loopless else None, + ) + excluded = set(exclude_reactions or ()) + return [ + rxn_id + for rxn_id, lo, hi in zip(fva.index, fva["minimum"], fva["maximum"], strict=True) + if rxn_id not in excluded and max(abs(lo), abs(hi)) > flux_tol + ] + + +def random_sampling( + model: cobra.Model, + n_samples: int = 1000, + *, + n_objectives: int = 2, + good_reactions: Iterable[str] | None = None, + replace_max_bound: bool = False, + min_flux: bool = False, + loopless_good_reactions: bool = True, + exclude_reactions: Iterable[str] | None = None, + max_attempts: int = 100, + suppress_errors: bool = False, + seed: int | None = None, +) -> RandomSamplingResult: + """Random-objective sampling of ``model``'s flux space (Bordel et al. 2010). + + Each sample maximises ``sum(w_i * v_i)`` over ``n_objectives`` reactions drawn at + random from ``good_reactions``, with weights ``w_i = U(0,1) * (±1)`` (a random + sign per reaction, as in RAVEN). The resulting flux vector is one sample. + + Parameters + ---------- + n_samples + Number of flux vectors to return. + n_objectives + Reactions combined into each random objective (RAVEN's fixed 2). + good_reactions + Reaction ids eligible as objectives. If ``None`` they are computed once with + :func:`find_good_reactions` and returned for reuse. 
+ replace_max_bound + RAVEN's ``replaceBoundsWithInf``: replace the largest upper bound with + ``+inf`` (and the smallest negative lower bound with ``-inf``) before + sampling, so a reaction whose biological maximum exceeds the model's + arbitrary cap is not pinned at it. **Off by default** — unlike RAVEN. It + applies only to the sampling phase (``good_reactions`` is always found on + the finite bounds), and it can open unbounded directions through loops + that show up as large fluxes in non-objective reactions; pair it with + ``min_flux`` if you enable it. + min_flux + After maximising the random objective, re-solve parsimoniously + (:func:`cobra.flux_analysis.pfba`) to minimise total flux at that optimum — + squeezes residual loops out of each individual sample. + loopless_good_reactions, exclude_reactions + Forwarded to :func:`find_good_reactions` when it is invoked (loopless loop + detection is on by default). + max_attempts, suppress_errors + A sample is retried if the random objective is degenerate (zero flux). After + ``max_attempts`` failures this raises, unless ``suppress_errors`` (then the + degenerate solution is kept with a warning). + seed + Seed for reproducible objective draws. + + Returns + ------- + RandomSamplingResult + """ + if n_samples <= 0: + raise ValueError("n_samples must be positive.") + rng = np.random.default_rng(seed) + model = model.copy() + + if model.slim_optimize(error_value=None) is None: + raise ValueError( + "The model has no feasible solution, likely due to incompatible constraints." + ) + + # good_reactions must be found on the finite bounds (FVA cannot handle inf), + # before any bound replacement. 
+ if good_reactions is None: + good_reactions = find_good_reactions( + model, loopless=loopless_good_reactions, + exclude_reactions=exclude_reactions, + ) + good_reactions = list(good_reactions) + + if replace_max_bound: + max_ub = max(r.upper_bound for r in model.reactions) + min_lb = min(r.lower_bound for r in model.reactions) + for r in model.reactions: + if r.upper_bound == max_ub: + r.upper_bound = float("inf") + if min_lb < 0 and r.lower_bound == min_lb: + r.lower_bound = float("-inf") + + if len(good_reactions) < n_objectives: + raise ValueError( + f"Only {len(good_reactions)} usable reactions found, need at least " + f"n_objectives={n_objectives}. Check the model's constraints." + ) + + good_rxn_objs = [model.reactions.get_by_id(r) for r in good_reactions] + reaction_ids = [r.id for r in model.reactions] + samples = np.zeros((n_samples, len(reaction_ids))) + + for i in range(n_samples): + for attempt in range(1, max_attempts + 1): + chosen = rng.choice(len(good_rxn_objs), size=n_objectives, replace=False) + signs = rng.choice((-1.0, 1.0), size=n_objectives) + weights = rng.random(n_objectives) * signs + terms = [w * good_rxn_objs[j].flux_expression + for j, w in zip(chosen, weights, strict=True)] + model.objective = model.problem.Objective(sum(terms), direction="max") + sol = model.optimize() + if sol.status == "optimal" and abs(sol.objective_value) > 1e-8: + samples[i, :] = (pfba(model) if min_flux else sol).fluxes.reindex(reaction_ids).to_numpy() + break + if attempt == max_attempts: + if not suppress_errors: + raise RuntimeError( + "Could not find a non-zero, loop-free solution after " + f"{max_attempts} attempts for sample {i}. Review the model's " + "constraints, or set suppress_errors=True." 
+ ) + logger.warning("Sample %d: kept a degenerate solution after %d attempts.", + i, max_attempts) + samples[i, :] = sol.fluxes.reindex(reaction_ids).to_numpy() + + return RandomSamplingResult( + samples=pd.DataFrame(samples, columns=reaction_ids), + good_reactions=good_reactions, + ) diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py new file mode 100644 index 0000000..2d4b5a2 --- /dev/null +++ b/src/raven_python/binaries.py @@ -0,0 +1,148 @@ +"""Locate and provision external command-line binaries (BLAST+, DIAMOND, …). + +Shared across tools (not homology-specific). Resolution order for any executable: + + explicit path arg → env var (RAVEN_PYTHON_) → shutil.which (PATH) + → ensure_binary (download the version-pinned ZIP from a raven_python release, + verify SHA256, cache, return the path) + → FileNotFoundError with install guidance + +So a pre-installed/conda binary always wins; the bundled ZIP is the zero-setup +fallback. See docs/maintaining_binaries.md for how the release ZIPs and the +registry are produced and updated. +""" +from __future__ import annotations + +import hashlib +import os +import platform +import shutil +import zipfile +from pathlib import Path +from urllib.request import urlopen + +# Registry of bundled binaries. Empty until release ZIPs are published; populated +# per docs/maintaining_binaries.md. Keyed by *bundle*; one bundle can provide +# several executables (e.g. "blast" -> blastp + makeblastdb). +# bundle -> {version, provides:[exe...], platforms:{"-": {url, sha256}}} +_REGISTRY: dict = {} + +# Environment variable overrides per executable. 
+_ENV_VARS = { + "diamond": "RAVEN_PYTHON_DIAMOND", + "blastp": "RAVEN_PYTHON_BLASTP", + "makeblastdb": "RAVEN_PYTHON_MAKEBLASTDB", + "hmmbuild": "RAVEN_PYTHON_HMMBUILD", + "hmmpress": "RAVEN_PYTHON_HMMPRESS", + "hmmsearch": "RAVEN_PYTHON_HMMSEARCH", + "hmmscan": "RAVEN_PYTHON_HMMSCAN", + "mafft": "RAVEN_PYTHON_MAFFT", + "cd-hit": "RAVEN_PYTHON_CDHIT", +} + + +def platform_key() -> str: + """Return the ``-`` key used in the registry (e.g. ``linux-x86_64``).""" + system = {"linux": "linux", "darwin": "macos", "windows": "windows"}.get( + platform.system().lower(), platform.system().lower() + ) + machine = platform.machine().lower() + arch = {"x86_64": "x86_64", "amd64": "x86_64", "arm64": "arm64", "aarch64": "arm64"}.get( + machine, machine + ) + return f"{system}-{arch}" + + +def _cache_dir() -> Path: + base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache") + return Path(base) / "raven_python" / "binaries" + + +def _bundle_for(executable: str, registry: dict): + for name, bundle in registry.items(): + if executable in bundle.get("provides", []): + return name, bundle + return None, None + + +def _sha256(path: Path) -> str: + h = hashlib.sha256() + with open(path, "rb") as fh: + for chunk in iter(lambda: fh.read(1 << 20), b""): + h.update(chunk) + return h.hexdigest() + + +def ensure_binary(executable: str, *, registry: dict | None = None) -> Path: + """Download (if needed) and return the path to a bundled ``executable``. + + Consults the registry for the current platform, downloads the pinned ZIP, + verifies its SHA256, extracts it into the cache, and returns the executable + path. Raises ``FileNotFoundError`` if no bundle for this platform is hosted. + """ + registry = _REGISTRY if registry is None else registry + bundle_name, bundle = _bundle_for(executable, registry) + if bundle is None: + raise FileNotFoundError( + f"No bundled binary registered for {executable!r}. Install it (e.g. 
" + f"`conda install -c bioconda {executable}`) or pass an explicit path." + ) + key = platform_key() + entry = bundle.get("platforms", {}).get(key) + if entry is None: + raise FileNotFoundError( + f"No bundled {executable!r} for platform {key!r}. Install it " + f"(e.g. `conda install -c bioconda {executable}`), set " + f"{_ENV_VARS.get(executable, 'the binary path')}, or pass binary=." + ) + + dest_dir = _cache_dir() / f"{bundle_name}-{bundle['version']}-{key}" + exe = dest_dir / executable + if exe.exists(): + return exe + + dest_dir.mkdir(parents=True, exist_ok=True) + archive = dest_dir / "_download.zip" + # Download into a sibling .part file and rename on success — an interrupted + # download leaves the partial behind .part, never as a half-complete .zip + # that a later run might mistake for a finished one. Mirrors data.py. + part = archive.with_suffix(archive.suffix + ".part") + try: + with urlopen(entry["url"]) as resp, open(part, "wb") as out: # noqa: S310 + shutil.copyfileobj(resp, out) + digest = _sha256(part) + if digest != entry["sha256"]: + raise ValueError( + f"SHA256 mismatch for {executable!r} ({key}): " + f"expected {entry['sha256']}, got {digest}." 
+ ) + os.replace(part, archive) + finally: + part.unlink(missing_ok=True) + with zipfile.ZipFile(archive) as zf: + zf.extractall(dest_dir) + archive.unlink(missing_ok=True) + if not exe.exists(): + raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.") + exe.chmod(0o755) + return exe + + +def resolve_binary(executable: str, *, binary: str | os.PathLike | None = None) -> str: + """Resolve an executable to a path: arg → env var → PATH → bundled ZIP → error.""" + if binary is not None: + return os.fspath(binary) + env_var = _ENV_VARS.get(executable) + if env_var and os.environ.get(env_var): + return os.environ[env_var] + found = shutil.which(executable) + if found: + return found + try: + return os.fspath(ensure_binary(executable)) + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Could not find {executable!r}. Install it (e.g. " + f"`conda install -c bioconda {executable}`), put it on PATH, set " + f"{env_var or 'the binary path'}, or pass binary=. ({exc})" + ) from exc diff --git a/src/raven_python/comparison/__init__.py b/src/raven_python/comparison/__init__.py new file mode 100644 index 0000000..e4b4c19 --- /dev/null +++ b/src/raven_python/comparison/__init__.py @@ -0,0 +1,7 @@ +"""Structural and functional comparison across multiple models. + +See :func:`raven_python.comparison.compare.compare_models`. +""" +from raven_python.comparison.compare import ModelComparison, compare_models + +__all__ = ["ModelComparison", "compare_models"] diff --git a/src/raven_python/comparison/compare.py b/src/raven_python/comparison/compare.py new file mode 100644 index 0000000..c7d38a1 --- /dev/null +++ b/src/raven_python/comparison/compare.py @@ -0,0 +1,149 @@ +"""N-model structural and functional comparison. + +Compare two or more models — typically context-specific models extracted from the same +template — on their reactions, metabolites, genes, subsystems, and (optionally) which +metabolic tasks they perform. 
Returns tidy :class:`pandas.DataFrame`\\ s suitable for +downstream plotting (heatmaps, tSNE/MDS, …) in seaborn / scikit-learn; plotting is +intentionally not in this function so it stays usable inside pipelines. + +All matrices use the union of ids across the input models as the row index, so missing +entries are unambiguously ``0`` / ``False`` rather than ``NaN``. +""" +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import dataclass, field + +import cobra +import pandas as pd + +from raven_python.tasks import Task, check_tasks + + +@dataclass +class ModelComparison: + """Tabular result of :func:`compare_models`. + + All matrices are indexed by id (reactions/metabolites/genes/subsystems) with one + column per model. ``presence`` matrices are 0/1; ``subsystems`` is the per-model + reaction count per subsystem. ``similarity`` is the model × model Jaccard on the + reaction set (1 = identical, 0 = disjoint). + """ + + model_ids: list[str] + reactions: pd.DataFrame + metabolites: pd.DataFrame + genes: pd.DataFrame + subsystems: pd.DataFrame + similarity: pd.DataFrame + tasks: pd.DataFrame | None = None # filled iff tasks were supplied + failed_tasks: dict[str, list[str]] = field(default_factory=dict) + + +def _presence_matrix(items_per_model: list[list[str]], model_ids: list[str]) -> pd.DataFrame: + """Build a 0/1 DataFrame: union of items as index × one column per model.""" + ordered: list[str] = [] + seen: set[str] = set() + for items in items_per_model: + for it in items: + if it not in seen: + seen.add(it) + ordered.append(it) + df = pd.DataFrame(0, index=ordered, columns=model_ids, dtype="int8") + for mid, items in zip(model_ids, items_per_model, strict=True): + if items: # avoid empty-list edge case + df.loc[list(set(items) & seen), mid] = 1 + return df + + +def _subsystem_counts(model: cobra.Model) -> dict[str, int]: + """{subsystem_name: reaction_count}. 
Reactions with empty subsystem fall under '(none)'.""" + counts: dict[str, int] = {} + for r in model.reactions: + # cobra stores subsystem as a string; RAVEN sometimes uses cell-of-cells (we'd + # already have it as a string here, but guard against list/tuple from messy YAML). + sub = r.subsystem + if isinstance(sub, (list, tuple)): + sub = sub[0] if sub else "" + sub = (sub or "").strip() or "(none)" + counts[sub] = counts.get(sub, 0) + 1 + return counts + + +def _jaccard_matrix(presence: pd.DataFrame) -> pd.DataFrame: + """Pairwise Jaccard similarity from a 0/1 presence matrix (rows = items, cols = models).""" + arr = presence.values.astype(bool) + out = pd.DataFrame(0.0, index=presence.columns, columns=presence.columns) + for i, a in enumerate(presence.columns): + ai = arr[:, i] + for j, b in enumerate(presence.columns): + bj = arr[:, j] + inter = int((ai & bj).sum()) + union = int((ai | bj).sum()) + out.loc[a, b] = inter / union if union else 1.0 + return out + + +def compare_models( + models: Iterable[cobra.Model], + *, + tasks: str | Iterable[Task] | None = None, +) -> ModelComparison: + """Compare N cobra models on their reactions / metabolites / genes / subsystems + (and tasks, if provided). + + ``tasks`` is forwarded to :func:`raven_python.tasks.check_tasks` on each model; pass a + file path or a parsed task list. When omitted, ``ModelComparison.tasks`` is ``None``. + + Models are identified by ``model.id`` (with a fallback to ``model_<i>`` if missing; + duplicates get a ``__<n>`` suffix). + """ + models_list = list(models) + if len(models_list) < 2: + raise ValueError(f"compare_models needs ≥2 models; got {len(models_list)}") + + # Unique, stable model ids. 
+ model_ids: list[str] = [] + seen: set[str] = set() + for i, m in enumerate(models_list): + mid = (m.id or "").strip() or f"model_{i}" + base, n = mid, 2 + while mid in seen: + mid, n = f"{base}__{n}", n + 1 + seen.add(mid) + model_ids.append(mid) + + reactions = _presence_matrix([[r.id for r in m.reactions] for m in models_list], model_ids) + metabolites = _presence_matrix([[x.id for x in m.metabolites] for m in models_list], model_ids) + genes = _presence_matrix([[g.id for g in m.genes] for m in models_list], model_ids) + + # Subsystems: union of names, per-model reaction counts. + sub_counts = [_subsystem_counts(m) for m in models_list] + sub_ids = sorted({s for c in sub_counts for s in c}) + subsystems = pd.DataFrame(0, index=sub_ids, columns=model_ids, dtype="int32") + for mid, c in zip(model_ids, sub_counts, strict=True): + for s, n in c.items(): + subsystems.at[s, mid] = n + + similarity = _jaccard_matrix(reactions) + + task_df: pd.DataFrame | None = None + failed: dict[str, list[str]] = {} + if tasks is not None: + # raven_python.tasks.check_tasks accepts a path or an iterable of Task; preserve task + # ids for the index. Capture the list once so all models test the same set. 
+ from raven_python.tasks.tasklist import parse_task_list + task_list = (parse_task_list(tasks) if isinstance(tasks, (str, bytes)) + or hasattr(tasks, "__fspath__") else list(tasks)) + task_ids = [t.id for t in task_list] + task_df = pd.DataFrame(False, index=task_ids, columns=model_ids, dtype=bool) + for mid, m in zip(model_ids, models_list, strict=True): + results = check_tasks(m, task_list) + for r in results: + task_df.at[r.id, mid] = bool(r.passed) + if not r.passed and r.error: + failed.setdefault(mid, []).append(f"{r.id}: {r.error}") + + return ModelComparison(model_ids=model_ids, reactions=reactions, metabolites=metabolites, + genes=genes, subsystems=subsystems, similarity=similarity, + tasks=task_df, failed_tasks=failed) diff --git a/src/raven_python/data.py b/src/raven_python/data.py new file mode 100644 index 0000000..b1264be --- /dev/null +++ b/src/raven_python/data.py @@ -0,0 +1,135 @@ +"""Fetch and cache published data artefacts (KEGG reference model, tables, HMMs). + +The mirror of :mod:`raven_python.binaries` for *data*: a version-pinned registry of +downloadable artefacts, fetched on first use, SHA256-verified, and cached under +platformdirs so end users never rebuild them from a KEGG dump (that is the +maintainer's job — see docs/maintaining_kegg_data.md). + +Resolution for any artefact file: + + explicit local dir → cached copy → download from the registry (verify, + cache) → FileNotFoundError with guidance + +The registry is **empty until the artefacts are published** (same as +``binaries._REGISTRY``); until then ``ensure_data_file`` raises an actionable +error. Cache layout:: + + $XDG_CACHE_HOME/raven_python/data/<dataset>-<version>/ + (or ~/.cache/raven_python/data/... 
if XDG_CACHE_HOME is unset) +""" +from __future__ import annotations + +import os +import shutil +from pathlib import Path +from urllib.request import urlopen + +from raven_python.binaries import _sha256 + +# dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}} +# Populated when raven_python publishes the KEGG artefacts as release assets. +_DATA_REGISTRY: dict = {} + +# The core KEGG artefacts needed to build a model (no HMM libraries). +CORE_KEGG_FILES = ( + "reference_model.yml.gz", + "ko_reaction.tsv.gz", + "ko_names.tsv.gz", + "organism_gene_ko.tsv.xz", + "rxn_flags.tsv.gz", +) + + +def _data_cache_dir() -> Path: + base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache") + return Path(base) / "raven_python" / "data" + + +def _bundle(dataset: str, registry: dict) -> dict: + bundle = registry.get(dataset) + if bundle is None: + raise FileNotFoundError( + f"No data artefacts registered for {dataset!r}. Either pass a local " + f"directory of artefacts, or build them per docs/maintaining_kegg_data.md." + ) + return bundle + + +def ensure_data_file( + dataset: str, + filename: str, + *, + version: str | None = None, + registry: dict | None = None, +) -> Path: + """Download (if needed) and return the cached path to one artefact file. + + Looks the file up in the registry for ``dataset`` (at ``version`` or the + registry's default), downloads it to the version-pinned cache directory, + verifies its SHA256, and returns the path. Re-uses an already-cached copy. + """ + registry = _DATA_REGISTRY if registry is None else registry + bundle = _bundle(dataset, registry) + ver = version or bundle["version"] + entry = bundle.get("files", {}).get(filename) + if entry is None: + raise FileNotFoundError( + f"{filename!r} is not registered for {dataset!r} {ver}. " + f"Available: {sorted(bundle.get('files', {}))}." 
+ ) + + dest_dir = _data_cache_dir() / f"{dataset}-{ver}" + dest = dest_dir / filename + if dest.exists(): + return dest + + dest_dir.mkdir(parents=True, exist_ok=True) + tmp = dest.with_name(dest.name + ".part") + with urlopen(entry["url"]) as resp, open(tmp, "wb") as out: # noqa: S310 (trusted registry URLs) + shutil.copyfileobj(resp, out) + digest = _sha256(tmp) + if digest != entry["sha256"]: + tmp.unlink(missing_ok=True) + raise ValueError( + f"SHA256 mismatch for {dataset}/{filename} ({ver}): " + f"expected {entry['sha256']}, got {digest}." + ) + tmp.replace(dest) + return dest + + +def ensure_kegg_data( + *, + version: str | None = None, + files: tuple[str, ...] = CORE_KEGG_FILES, + registry: dict | None = None, +) -> Path: + """Ensure the core KEGG artefacts are cached; return their directory. + + Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg`` + dataset and returns the cache directory holding them — ready to pass as the + ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`. + """ + registry = _DATA_REGISTRY if registry is None else registry + ver = version or _bundle("kegg", registry)["version"] + for filename in files: + ensure_data_file("kegg", filename, version=ver, registry=registry) + return _data_cache_dir() / f"kegg-{ver}" + + +def ensure_kegg_hmm_library( + domain: str, *, version: str | None = None, registry: dict | None = None +) -> Path: + """Ensure a domain HMM library (and its hmmpress index) is cached; return its path. + + ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``<domain>.hmm`` + plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the + path to the ``.hmm`` (the argument for :func:`run_hmmscan`). 
+ """ + registry = _DATA_REGISTRY if registry is None else registry + ver = version or _bundle("kegg", registry)["version"] + base = f"{domain}.hmm" + library = ensure_data_file("kegg", base, version=ver, registry=registry) + for suffix in (".h3f", ".h3i", ".h3m", ".h3p"): + ensure_data_file("kegg", base + suffix, version=ver, registry=registry) + return library diff --git a/src/raven_python/gapfilling/__init__.py b/src/raven_python/gapfilling/__init__.py new file mode 100644 index 0000000..747b293 --- /dev/null +++ b/src/raven_python/gapfilling/__init__.py @@ -0,0 +1,9 @@ +"""Connectivity gap-filling against template models. + +:func:`connect_blocked_reactions` adds the fewest (lowest-penalty) template reactions so +reactions blocked in a draft can carry flux. For the other gap-fill flavour (fill until +the objective is feasible) use ``cobra.flux_analysis.gapfill``. +""" +from raven_python.gapfilling.fill import GapFillResult, connect_blocked_reactions + +__all__ = ["GapFillResult", "connect_blocked_reactions"] diff --git a/src/raven_python/gapfilling/fill.py b/src/raven_python/gapfilling/fill.py new file mode 100644 index 0000000..ba3418d --- /dev/null +++ b/src/raven_python/gapfilling/fill.py @@ -0,0 +1,172 @@ +"""Connectivity gap-filling: add the fewest template reactions so reactions that are +*blocked* in a draft can carry flux. + +For the other gap-filling flavour (add the fewest template reactions until the model's +own objective becomes feasible) use ``cobra.flux_analysis.gapfill`` — just align the +template's metabolite ids to the draft first, since cobra matches by id. + +It solves an MILP: pick the minimum-penalty subset of template reactions such that the +blocked (irreversible) draft reactions can carry flux at steady state. Template +metabolites are matched to the draft by ``name[compartment]`` (via +:func:`add_reactions_from_model`), so templates in a different identifier namespace +than the model still work. 
Per-reaction ``scores`` (higher = prefer to include) map to +RAVEN's ``rxnScores``; the MILP minimises the penalty ``-score`` (default penalty +``1.0``, i.e. minimise the number of reactions added). +""" +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import dataclass + +import cobra +from cobra.flux_analysis import find_blocked_reactions, flux_variability_analysis + +from raven_python.manipulation.transfer import add_reactions_from_model + + +@dataclass +class GapFillResult: + """Outcome of a connectivity gap-fill. + + ``added_reactions`` are the template reaction ids added to ``model``; + ``newly_connected`` are draft reactions that were blocked but can now carry flux; + ``cannot_connect`` are blocked reactions left unconnectable. + """ + + added_reactions: list[str] + newly_connected: list[str] + cannot_connect: list[str] + model: cobra.Model + + +def _as_models(templates: cobra.Model | Iterable[cobra.Model]) -> list[cobra.Model]: + return [templates] if isinstance(templates, cobra.Model) else list(templates) + + +def _merge_templates(model: cobra.Model, templates: list[cobra.Model]) -> tuple[cobra.Model, list[str]]: + """Copy every template reaction (new ones only) into a working copy of ``model``. + + Returns the working model and the ids of the reactions that came from templates + (the gap-fill candidates). Metabolites are matched by ``name[compartment]``. 
+ """ + working = model.copy() + template_ids: list[str] = [] + for template in templates: + new = [r.id for r in template.reactions if r.id not in working.reactions] + if new: + added = add_reactions_from_model(working, template, new, genes=False, note=None) + template_ids += [r.id for r in added] + return working, template_ids + + +def _solve_min_templates( + working: cobra.Model, + template_ids: list[str], + *, + scores: dict[str, float] | None, + penalty: float, + allow_net_production: bool, +) -> set[str] | None: + """MILP: minimum-penalty template reactions making ``working`` feasible. + + The requirement (here, forced flux through the blocked reactions) must already be + imposed on ``working``. Returns the template reaction ids to keep, or ``None`` if + the problem is infeasible. + """ + prob = working.problem + indicators: dict[str, object] = {} + extra = [] + for rid in template_ids: + rxn = working.reactions.get_by_id(rid) + y = prob.Variable(f"_gf_keep_{rid}", type="binary") + indicators[rid] = y + # Flux is confined to [lb*y, ub*y]: zero unless the reaction is kept (y=1). 
+ extra.append(prob.Constraint(rxn.flux_expression - rxn.upper_bound * y, ub=0, name=f"_gf_ub_{rid}")) + extra.append(prob.Constraint(rxn.flux_expression - rxn.lower_bound * y, lb=0, name=f"_gf_lb_{rid}")) + working.add_cons_vars(list(indicators.values()) + extra) + + if allow_net_production: # relax steady state to Sv >= 0 (mets may accumulate) + for met in working.metabolites: + working.constraints[met.id].ub = None + + def pen(rid: str) -> float: + return -scores[rid] if scores and rid in scores else penalty + + working.objective = prob.Objective( + sum(pen(rid) * indicators[rid] for rid in template_ids), direction="min" + ) + working.slim_optimize() + if working.solver.status != "optimal": + return None + return {rid for rid, y in indicators.items() if (y.primal or 0) > 0.5} + + +def _build_filled(model: cobra.Model, templates: list[cobra.Model], chosen: set[str]) -> cobra.Model: + filled = model.copy() + remaining = set(chosen) + for template in templates: + ids = [r for r in remaining if r in template.reactions] + if ids: + add_reactions_from_model(filled, template, ids, genes=False, note="Added by connect_blocked_reactions") + remaining -= set(ids) + return filled + + +def connect_blocked_reactions( + model: cobra.Model, + templates: cobra.Model | Iterable[cobra.Model], + *, + scores: dict[str, float] | None = None, + penalty: float = 1.0, + allow_net_production: bool = False, + eps: float = 1.0, +) -> GapFillResult: + """Add template reactions so blocked draft reactions can carry flux. + + Finds reactions that + cannot carry flux in ``model``, then adds the minimum-penalty set of template + reactions that lets the (irreversible) ones carry flux, and returns the filled + model. Like RAVEN, only irreversible blocked reactions are forced — reversible + ones can carry flux trivially in the split formulation, so forcing them is + uninformative. 
+ + For the *other* gap-filling flavour — adding reactions to make the model's + objective feasible — use ``cobra.flux_analysis.gapfill`` after aligning the + template's metabolite ids to the draft. + + The draft is expected to have exchange reactions for its nutrients (otherwise most + reactions are trivially blocked). + """ + templates = _as_models(templates) + blocked = set(find_blocked_reactions(model)) + candidates = [r for r in blocked if model.reactions.get_by_id(r).lower_bound >= 0] + + working, template_ids = _merge_templates(model, templates) + + target: list[str] = [] + if candidates: + fva = flux_variability_analysis(working, reaction_list=candidates, fraction_of_optimum=0.0) + # A reaction can be missing from the FVA frame if the solver dropped it + # (e.g. the reaction was eliminated upstream); treat that as "unreachable" + # rather than letting the KeyError propagate. + target = [ + r for r in candidates + if r in fva.index and fva.at[r, "maximum"] > eps + ] + + cannot = sorted(blocked - set(target)) + if not target: + return GapFillResult([], [], cannot, model.copy()) + + for rid in target: + working.reactions.get_by_id(rid).lower_bound = eps + chosen = _solve_min_templates( + working, template_ids, scores=scores, penalty=penalty, + allow_net_production=allow_net_production, + ) + if chosen is None: + raise RuntimeError( + "Gap-filling is infeasible: the blocked reactions cannot all carry flux " + "even with every template reaction added." + ) + return GapFillResult(sorted(chosen), sorted(target), cannot, _build_filled(model, templates, chosen)) diff --git a/src/raven_python/init/__init__.py b/src/raven_python/init/__init__.py new file mode 100644 index 0000000..040f299 --- /dev/null +++ b/src/raven_python/init/__init__.py @@ -0,0 +1,46 @@ +"""Context-specific model extraction (tINIT / ftINIT). + +tINIT: +* :func:`run_init` — the classic INIT MILP. 
+* :func:`score_reactions_from_genes` / :func:`gene_scores_from_expression` — + gene → reaction scoring (RNA-seq is the common upstream). +* :func:`get_init_model` — the tINIT pipeline (dead-end removal + ``run_init``). + +ftINIT (faster, staged): +* :func:`run_ftinit` — the single-step ftINIT MILP (continuous indicators for + positive-score reactions; binaries only on negatives — the speedup over ``run_init``). +* :func:`ftinit` — the full pipeline (``prep_init_model`` → staged ``run_ftinit`` → + ``fill_tasks`` → ``remove_low_score_genes``). +""" +from raven_python.init.build import InitModelResult, get_init_model +from raven_python.init.ftinit import FtInitResult, ftinit, run_ftinit +from raven_python.init.genes import remove_low_score_genes +from raven_python.init.init import InitResult, run_init +from raven_python.init.merge import group_rxn_scores, merge_linear +from raven_python.init.prep import PrepData, ReactionMasks, classify_reactions, prep_init_model +from raven_python.init.score import gene_scores_from_expression, score_reactions_from_genes +from raven_python.init.steps import InitStep, get_init_steps +from raven_python.init.taskfill import TaskFillResult, fill_tasks + +__all__ = [ + "FtInitResult", + "InitModelResult", + "InitResult", + "InitStep", + "PrepData", + "ReactionMasks", + "TaskFillResult", + "classify_reactions", + "fill_tasks", + "ftinit", + "gene_scores_from_expression", + "get_init_model", + "get_init_steps", + "group_rxn_scores", + "merge_linear", + "prep_init_model", + "remove_low_score_genes", + "run_ftinit", + "run_init", + "score_reactions_from_genes", +] diff --git a/src/raven_python/init/build.py b/src/raven_python/init/build.py new file mode 100644 index 0000000..a0d0538 --- /dev/null +++ b/src/raven_python/init/build.py @@ -0,0 +1,113 @@ +"""tINIT model building — high-level pipeline. 
+ +Turn expression-derived scores into reaction scores (via the GPR), drop reactions that +cannot carry flux, then run the INIT MILP to extract a context-specific model. Pass +gene scores (typically from :func:`gene_scores_from_expression` or one of the omics +loaders) or reaction scores directly. ``essential_rxns`` are forced kept. + +For task-aware gap-filling on top of the resulting model, use ftINIT +(:func:`raven_python.init.ftinit`); ``get_init_model`` itself does not run the task layer. +""" +from __future__ import annotations + +from collections.abc import Iterable, Mapping +from dataclasses import dataclass + +import cobra +from cobra.flux_analysis import find_blocked_reactions + +from raven_python.init.init import run_init +from raven_python.init.score import score_reactions_from_genes + + +@dataclass +class InitModelResult: + """Result of :func:`get_init_model`.""" + + model: cobra.Model + reaction_scores: dict[str, float] + deleted_dead_end_reactions: list[str] + deleted_in_init: list[str] + met_production: dict[str, bool] + objective: float + + +def get_init_model( + ref_model: cobra.Model, + *, + rxn_scores: Mapping[str, float] | None = None, + gene_scores: Mapping[str, float] | None = None, + isozyme_scoring: str = "max", + complex_scoring: str = "min", + no_gene_score: float = -2.0, + essential_rxns: Iterable[str] | None = None, + present_mets: Iterable[str] | None = None, + prod_weight: float = 0.5, + allow_excretion: bool = True, + no_rev_loops: bool = False, + remove_dead_ends: bool = True, + eps: float = 1.0, + big_m: float | None = None, + mip_gap: float | None = None, + time_limit: float | None = None, +) -> InitModelResult: + """Extract a context-specific model with tINIT. + + Provide either ``rxn_scores`` (reaction id → score) or ``gene_scores`` (gene id → + score, converted via the GPR with :func:`score_reactions_from_genes`). 
Reactions + that cannot carry flux (with exchanges open) are removed first unless + ``remove_dead_ends=False``; ``essential_rxns`` are kept regardless. The remaining + model is passed to :func:`run_init`. + """ + if (rxn_scores is None) == (gene_scores is None): + raise ValueError("Provide exactly one of rxn_scores or gene_scores.") + + model = ref_model.copy() + essential = set(essential_rxns or []) + if gene_scores is not None: + scores = score_reactions_from_genes( + model, gene_scores, isozyme_scoring=isozyme_scoring, + complex_scoring=complex_scoring, no_gene_score=no_gene_score, + ) + else: + scores = dict(rxn_scores) + + deleted_dead_end: list[str] = [] + if remove_dead_ends: + # Identify and drop reactions that cannot carry flux even under the + # *most permissive* boundary regime: every metabolite open for excretion + # (when ``allow_excretion``) plus the exchange-opened FVA. That makes + # the pre-filter conservative — only reactions blocked under both lax + # and strict regimes are removed, so the strict run_init path never + # loses a candidate it could have used. 
+ probe = model.copy() + original_ids = {r.id for r in model.reactions} + if allow_excretion: + has_boundary = {m.id for r in probe.boundary for m in r.metabolites} + for met in list(probe.metabolites): + if met.id not in has_boundary: + probe.add_boundary(met, type="demand") + blocked = set(find_blocked_reactions(probe, open_exchanges=True)) + deleted_dead_end = sorted((blocked & original_ids) - essential) + model.remove_reactions(deleted_dead_end, remove_orphans=True) + + result = run_init( + model, scores, + present_mets=present_mets, + essential_rxns=essential & {r.id for r in model.reactions}, + prod_weight=prod_weight, + allow_excretion=allow_excretion, + no_rev_loops=no_rev_loops, + eps=eps, + big_m=big_m, + mip_gap=mip_gap, + time_limit=time_limit, + ) + return InitModelResult( + model=result.model, + reaction_scores=scores, + deleted_dead_end_reactions=deleted_dead_end, + deleted_in_init=result.deleted_reactions, + met_production=result.met_production, + objective=result.objective, + ) diff --git a/src/raven_python/init/ftinit.py b/src/raven_python/init/ftinit.py new file mode 100644 index 0000000..b355e45 --- /dev/null +++ b/src/raven_python/init/ftinit.py @@ -0,0 +1,328 @@ +"""The ftINIT MILP — the faster staged variant of INIT. + +ftINIT keeps tINIT's objective — pick the reaction subset best matching expression +scores while staying flux-consistent — but with a cheaper MILP encoding that is the +reason it is *fast*: a **positive-score reaction needs no binary**. Because the +objective *maximises* ``Σ score·y`` with ``score > 0``, the optimiser pushes its +continuous indicator ``y ∈ [0,1]`` to 1, and the gate ``net_flux ≥ force_on·y`` only +lets ``y`` reach 1 if the reaction can actually carry flux. Only *negative*-score +reactions need a true ``{0,1}`` binary (their indicator would otherwise sit at 0 for +free). This roughly halves the integer count — the dominant MILP cost. 
+ +Reaction categories (RAVEN's six), by score sign × reversibility: + +* **score 0** — left in the model, *not* in the problem: a free flux variable that can + carry flux for connectivity but is neither scored nor removable. +* **positive, irreversible** — continuous ``y∈[0,1]``; ``v ≥ force_on·y``. No binary. +* **positive, reversible** — split ``v = v⁺ − v⁻``; continuous ``y``; a single + direction binary keeps one of ``v⁺/v⁻`` at 0 (no fwd/back loop faking "on"); + ``v⁺+v⁻ ≥ force_on·y``. +* **negative, irreversible** — binary ``x∈{0,1}``; ``v ≤ ub·x``. +* **negative, reversible** — split; binary ``x``; ``v⁺+v⁻ ≤ cap·x``. +* **essential** — forced on (``v ≥ force_on_ess``); no indicator. Assumed already + oriented irreversible in its forced direction (``prepINITModel`` does this). + +Objective: **maximise** ``Σ score·indicator``. Unlike classic INIT +(:func:`raven_python.init.run_init`), ftINIT does **not** reward production of every +metabolite — ``prod_weight`` applies only to metabolomics-detected metabolites (not +yet implemented; passing a non-empty ``metabolomics`` argument raises +``NotImplementedError``). Connectivity comes solely from the flux gates plus any +essential reactions. ``allow_excretion`` relaxes ``S·v = 0`` to ``≥ 0``; ``rem_pos_rev`` +drops positive reversible reactions from the problem (used in the staging schedule). + +Needs a MILP solver (cobra's configured optlang solver; only Gurobi is fully viable at +genome scale — see ``docs/init_solver_benchmark.md``). Magic numbers +(``force_on``/``force_on_ess`` = 0.1, ``big_m`` = 100) are exposed and scale-dependent; +calibration tables are in ``docs/init_param_calibration.md``. ``big_m`` caps a *scored* +reaction's flux in its on/off (direction) constraint — using a fixed 100 rather than +the reaction's ±1000 bound keeps the LP relaxation tight (what makes the genome-scale +MILP tractable). Free / essential reactions keep their real bounds. 
+ +⚠️ **Loops.** The MILP has *no* loopless constraint: an internal +thermodynamically-infeasible cycle is flux-consistent (``S·v = 0``), so if its +reactions carry positive net score the optimiser will "include" them with no real +exchange flux. RAVEN tolerates this — loop-free models come from the staged pipeline ++ exchange handling, and at genome scale real exchange reactions make such cycles not +score-optimal. A loopless option could be layered on later if needed. +""" +from __future__ import annotations + +from collections.abc import Iterable, Mapping +from dataclasses import dataclass, field + +import cobra +from optlang.symbolics import Real, add, mul + +from raven_python.init.genes import remove_low_score_genes +from raven_python.init.merge import group_rxn_scores +from raven_python.init.steps import get_init_steps +from raven_python.init.taskfill import fill_tasks + +_FORCE_ON = 0.1 # min flux for a reaction to count as "on" (RAVEN forceOnLim) +_BIG_M = 100.0 # indicator/direction big-M cap on a *scored* reaction's flux (RAVEN's 100) + + +@dataclass +class FtInitResult: + """Result of :func:`run_ftinit`.""" + + model: cobra.Model + kept_reactions: list[str] + deleted_reactions: list[str] + fluxes: dict[str, float] + objective: float + on_reactions: set[str] = field(default_factory=set) # scored reactions turned on (indicator) + + +def run_ftinit( + model: cobra.Model, + rxn_scores: Mapping[str, float] | None = None, + *, + essential_rxns: Iterable[str] | None = None, + essential_directions: Mapping[str, int] | None = None, + essential_force: Mapping[str, float] | None = None, + allow_excretion: bool = False, + rem_pos_rev: bool = False, + ignore_mets: Iterable[str] = (), + force_on: float = _FORCE_ON, + force_on_ess: float = _FORCE_ON, + big_m: float = _BIG_M, + mip_gap: float | None = None, + time_limit: float | None = None, +) -> FtInitResult: + """Run the single-step ftINIT MILP and return the extracted model. 
+ + ``rxn_scores`` maps reaction id → score (default 0 → reaction left free in the + model, not scored or removable). ``essential_rxns`` are forced to carry flux + (≥ ``force_on_ess``); ``essential_directions`` maps an essential reaction id to + ``+1`` (forward) or ``-1`` (reverse) for the forced direction (default forward). + ``ignore_mets`` are metabolite **names** whose mass balance is dropped (RAVEN's + per-step "simple metabolite" removal, e.g. H2O/H+). See the module docstring for + the formulation. This is the single-step variant; the staged schedule + (:func:`raven_python.init.ftinit`) calls it per step. + """ + scores = dict(rxn_scores or {}) + essential = set(essential_rxns or []) + directions = dict(essential_directions or {}) + essential_force = dict(essential_force or {}) + ignore_met_names = set(ignore_mets) + prob = model.problem + opt = prob.Model() + + variables: list = [] + constraints: list = [] + flux_terms: dict[str, list[tuple[object, float]]] = {} # rxn id -> [(var, sign)] + indicators: dict[str, tuple[object, float]] = {} # rxn id -> (indicator var, score) + free_or_essential: set[str] = set() # kept regardless of an indicator + + def add_constraint(expr, **kw): + constraints.append(prob.Constraint(expr, **kw)) + + for rxn in model.reactions: + rid = rxn.id + lb, ub = rxn.lower_bound, rxn.upper_bound + score = float(scores.get(rid, 0.0)) + if rem_pos_rev and score > 0 and lb < 0 < ub: + score = 0.0 # staging step 1: positive reversibles dropped from the problem + + if rid in essential: + # Forced to carry flux in its forced direction (default forward); respect a + # stricter native bound if the model already forces more flux. The forced + # magnitude may be set per reaction (RAVEN's min(0.99·|prev flux|, 0.1), so + # a reaction is never forced above what it carried before). 
+ force = essential_force.get(rid, force_on_ess) if essential_force else force_on_ess + if directions.get(rid, 1) >= 0: + forced = min(force, ub) # clamp to capacity so we never make lb > ub + v = prob.Variable(f"v_{rid}", lb=max(forced, lb, 0.0), ub=ub) + else: # reverse: flux ≤ -force + forced = min(force, -lb) + v = prob.Variable(f"v_{rid}", lb=lb, ub=min(-forced, ub)) + variables.append(v) + flux_terms[rid] = [(v, 1.0)] + free_or_essential.add(rid) + continue + + if score == 0.0: # free: carries flux for connectivity, not scored/removable + v = prob.Variable(f"v_{rid}", lb=lb, ub=ub) + variables.append(v) + flux_terms[rid] = [(v, 1.0)] + free_or_essential.add(rid) + continue + + reversible = lb < 0 < ub + if reversible: + vp = prob.Variable(f"vp_{rid}", lb=0.0, ub=ub) + vn = prob.Variable(f"vn_{rid}", lb=0.0, ub=-lb) + variables += [vp, vn] + flux_terms[rid] = [(vp, 1.0), (vn, -1.0)] + total = vp + vn # |flux| (one of vp/vn pinned to 0 below), used by the gates + else: # single-direction: keep the model's own [lb, ub] (incl. 
any forced lb>0) + v = prob.Variable(f"v_{rid}", lb=lb, ub=ub) + variables.append(v) + flux_terms[rid] = [(v, 1.0)] + total = v if ub > 0 else -v # magnitude for a single-direction reaction + + if score > 0: + y = prob.Variable(f"y_{rid}", lb=0.0, ub=1.0) # continuous indicator, no binary + variables.append(y) + indicators[rid] = (y, score) + add_constraint(total - force_on * y, lb=0.0, name=f"on_{rid}") # y=1 ⇒ |flux| ≥ force_on + if reversible: # one direction binary stops a fwd/back loop faking "on" + b = prob.Variable(f"b_{rid}", type="binary") + variables.append(b) + add_constraint(vp - big_m * b, ub=0.0, name=f"dirp_{rid}") # vp ≤ M·b + add_constraint(vn + big_m * b, ub=big_m, name=f"dirn_{rid}") # vn ≤ M·(1-b) + else: # score < 0 + x = prob.Variable(f"x_{rid}", type="binary") + variables.append(x) + indicators[rid] = (x, score) + add_constraint(total - big_m * x, ub=0.0, name=f"off_{rid}") # flux>0 ⇒ x=1 + + # Steady state S·v {== 0 | >= 0}; ignored metabolites are left unbalanced. + # Build each metabolite's balance as a *flat* list of (coeff·sign)·var terms and sum + # it with optlang.symbolics.add. Python's builtin sum re-canonicalises a growing + # sympy expression at every step (O(n²)); for hub metabolites that appear in ~10³ + # reactions that is minutes per constraint. add() builds the sum in one pass. 
+ met_terms: dict = {m: [] for m in model.metabolites if m.name not in ignore_met_names} + for rxn in model.reactions: + terms = flux_terms[rxn.id] + for met, coeff in rxn.metabolites.items(): + bucket = met_terms.get(met) + if bucket is None: + continue + for var, sign in terms: + bucket.append(mul([Real(coeff * sign), var])) + for termlist in met_terms.values(): + if termlist: + add_constraint(add(termlist), lb=0.0, ub=None if allow_excretion else 0.0) + + opt.add(variables + constraints) + opt.objective = prob.Objective( + add([mul([Real(score), ind]) for ind, score in indicators.values()]), direction="max" + ) + if time_limit is not None: + opt.configuration.timeout = int(time_limit) + if mip_gap is not None: + try: # Gurobi-specific; harmless if the backend differs + opt.problem.Params.MIPGap = mip_gap + except Exception: # noqa: BLE001 + pass + opt.optimize() + # Accept a near-optimal incumbent (when a MIP gap / time limit is set), as RAVEN does. + if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"): + raise RuntimeError(f"ftINIT MILP did not solve (status: {opt.status}).") + + # RAVEN: a reaction is "on" iff its indicator ≥ 0.5 (positive indicators are + # continuous and can land fractionally when a reaction can carry only tiny flux). 
def ftinit(
    prep,
    rxn_scores: Mapping[str, float],
    *,
    gene_scores: Mapping[str, float] | None = None,
    series: str = "1+1",
    steps=None,
    fill_gaps: bool = True,
    metabolomics: Iterable[str] | None = None,
    force_on: float = _FORCE_ON,
    big_m: float = _BIG_M,
    mip_gap: float | None = None,
    time_limit: float | None = None,
) -> cobra.Model:
    """Drive the staged ftINIT solve on ``prep`` and return the extracted model.

    ``prep`` is the once-per-template :class:`raven_python.init.PrepData`;
    ``rxn_scores`` maps **original** reaction id → score. ``steps`` defaults to
    ``get_init_steps(series)``.

    For every step the scores are regrouped onto the merged model with the step's
    ``ignore_mask`` categories zeroed, and :func:`run_ftinit` is solved. When the
    step's ``how_to_use_prev`` is ``"essential"``, every reaction turned on by an
    earlier step is added to the essential set, pinned to the sign of the flux it
    carried, and forced to carry ``min(0.99·|flux|, force_on)`` — never more than it
    carried before.

    After the last step the merged reactions that were turned on, are permanently
    essential, or scored exactly 0 in the final step are kept. Every other merged
    reaction is removed from a copy of ``prep.ref_model`` together with all original
    members of its merge group; exchange reactions are always retained.

    If ``fill_gaps`` is set and ``prep.tasks`` is non-empty, the result is passed
    through :func:`fill_tasks`; if ``gene_scores`` is given, it is then passed through
    :func:`remove_low_score_genes`.

    ``metabolomics`` is not implemented: any truthy value raises
    ``NotImplementedError``. ``mip_gap`` / ``time_limit`` are forwarded to every
    :func:`run_ftinit` solve and to :func:`fill_tasks` (calibration notes:
    ``docs/init_param_calibration.md``).
    """
    if metabolomics:
        raise NotImplementedError(
            "metabolomics production-bonus is not yet implemented."
        )
    if steps is None:
        steps = get_init_steps(series)
    merged_model = prep.min_model
    group_of = prep.group_of

    carried: dict[str, float] = {}  # merged reaction id -> flux from the step that last turned it on
    neutral: set[str] = set()  # merged reactions scoring exactly 0 in the most recent step
    for stage in steps:
        masked = prep.masks.ignored(stage.ignore_mask)
        stage_scores = group_rxn_scores(
            merged_model, rxn_scores, prep.orig_rxn_ids, prep.group_ids, masked
        )

        forced = set(prep.essential_rxns)  # task essentials are already oriented forward
        signs: dict[str, int] = {}
        floors: dict[str, float] = {}
        if stage.how_to_use_prev == "essential":
            forced.update(carried)
            for rid, flux in carried.items():
                signs[rid] = 1 if flux >= 0 else -1
                # cap the forced flux just below what the reaction actually carried
                floors[rid] = min(abs(flux) * 0.99, force_on)

        outcome = run_ftinit(
            merged_model,
            stage_scores,
            essential_rxns=forced,
            essential_directions=signs,
            essential_force=floors,
            allow_excretion=stage.allow_met_secr,
            rem_pos_rev=stage.pos_rev_off,
            ignore_mets=stage.mets_to_ignore,
            force_on=force_on,
            force_on_ess=force_on,
            big_m=big_m,
            mip_gap=mip_gap,
            time_limit=time_limit,
        )
        carried.update((rid, outcome.fluxes[rid]) for rid in outcome.on_reactions)
        neutral = {rid for rid, value in stage_scores.items() if value == 0.0}

    # Survivors on the merged model: turned on, permanently essential, or score 0.
    survivors = set(carried) | set(prep.essential_rxns) | neutral

    # Translate the dropped merged reactions back to original ids: a dropped group
    # takes all of its members with it, an unmerged reaction (group 0) only itself.
    dropped_groups: set[int] = set()
    dropped_singles: set[str] = set()
    for rxn in merged_model.reactions:
        if rxn.id in survivors:
            continue
        grp = group_of[rxn.id]
        if grp != 0:
            dropped_groups.add(grp)
        else:
            dropped_singles.add(rxn.id)
    discard = {oid for oid in prep.orig_rxn_ids if group_of[oid] in dropped_groups}
    discard |= dropped_singles

    # Exchange reactions are kept unconditionally.
    retained = (set(prep.orig_rxn_ids) - discard) | prep.masks.exchange

    result = prep.ref_model.copy()
    unwanted = [rxn.id for rxn in result.reactions if rxn.id not in retained]
    result.remove_reactions(unwanted, remove_orphans=True)

    if fill_gaps and prep.tasks:  # re-add reactions until every task is feasible
        filled = fill_tasks(
            result, prep.ref_model, prep.tasks,
            rxn_scores=rxn_scores, mip_gap=mip_gap, time_limit=time_limit,
        )
        result = filled.model
    if gene_scores is not None:  # final step: prune negative-scoring genes from GPRs
        result, _ = remove_low_score_genes(result, gene_scores)
    return result
+ +Drop negative-scoring genes from each reaction's GPR, while +respecting enzyme structure — genes joined by **OR** (isozymes) are candidates for +removal, but at least one must remain (the least-negative if all are negative); +genes joined by **AND** (complex subunits) are *not* removed individually, though a +whole complex can be dropped as one isozyme alternative if its (aggregated) score is +negative. Operates on cobra's GPR AST recursively, so nested rules like +``G1 and (G2 or G3) and G4`` prune the inner isozyme group correctly. +""" +from __future__ import annotations + +import ast +import statistics +from collections.abc import Mapping + +import cobra +from cobra.manipulation import remove_genes + +_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean} + + +def _prune(node, scores, iso, cplx) -> tuple[str | None, float | None]: + """Return (pruned GPR string, aggregate score) for an AST node, or (None, None).""" + if isinstance(node, ast.Name): + return node.id, scores.get(node.id) # None = unscored (NaN: never removed) + if not isinstance(node, ast.BoolOp): + return None, None + + children = [_prune(v, scores, iso, cplx) for v in node.values] + children = [(s, sc) for s, sc in children if s is not None] + + if isinstance(node.op, ast.And): # complex: keep every subunit, prune nested ORs + kept = children + else: # OR / isozymes: drop negative-scoring alternatives, keep at least one + kept = [(s, sc) for s, sc in children if sc is None or sc >= 0] + if not kept: # all negative → keep the least-negative + kept = [max(children, key=lambda c: c[1])] + + parts = [s for s, _ in kept] + score_vals = [sc for _, sc in kept if sc is not None] + agg = (cplx if isinstance(node.op, ast.And) else iso) + score = agg(score_vals) if score_vals else None + op = " and " if isinstance(node.op, ast.And) else " or " + text = parts[0] if len(parts) == 1 else "(" + op.join(parts) + ")" + return text, score + + +def remove_low_score_genes( + 
model: cobra.Model, + gene_scores: Mapping[str, float], + *, + isozyme_scoring: str = "max", + complex_scoring: str = "min", +) -> tuple[cobra.Model, list[str]]: + """Remove negative-scoring genes from GPRs (RAVEN ``removeLowScoreGenes``). + + ``gene_scores`` maps gene id → score; genes absent from it are treated as unscored + (never removed). Returns ``(new_model, removed_gene_ids)`` — genes dropped from + *every* rule they were in (and thus from the model). ``isozyme_scoring`` / + ``complex_scoring`` aggregate alternative/subunit scores (``max``/``min`` default). + + When all isozyme alternatives are negative the least-negative one is kept + **deterministically** (first on a tie), unlike RAVEN's random tie-break — same + quality, reproducible. + """ + for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)): + if value not in _AGG: + raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.") + iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring] + + out = model.copy() + for rxn in out.reactions: + body = rxn.gpr.body + if body is None or not rxn.genes: + continue + pruned, _ = _prune(body, gene_scores, iso, cplx) + if pruned is not None: + rxn.gene_reaction_rule = pruned + + used = {g.id for rxn in out.reactions for g in rxn.genes} + removed = sorted(g.id for g in out.genes if g.id not in used) + if removed: + remove_genes(out, removed, remove_reactions=False) + return out, removed diff --git a/src/raven_python/init/init.py b/src/raven_python/init/init.py new file mode 100644 index 0000000..f23e17a --- /dev/null +++ b/src/raven_python/init/init.py @@ -0,0 +1,254 @@ +"""The INIT MILP — tINIT core. + +INIT (Agren et al., PLoS Comput Biol 2012) extracts a context-specific model: keep a +flux-consistent subnetwork that maximises the summed score of *included* reactions +(positive score = evidence to keep, negative = evidence to remove), optionally +rewarding net production of metabolites. 
+ +Formulation: + +* Reversible reactions are split into forward / reverse directed reactions (flux ≥ 0). +* Each non-essential directed reaction gets a binary ``x`` (included ⇔ ``x=1``) with + ``eps·x ≤ v ≤ ub·x`` — included reactions must carry flux ≥ ``eps`` (connectivity), + excluded ones carry none. +* Essential reactions (``essential_rxns``) are forced to carry flux (``v ≥ eps``) and + skip the binary. +* ``no_rev_loops`` adds ``x_fwd + x_rev ≤ 1`` so a reversible reaction can't look + "connected" via an internal forward/back loop. +* Steady state ``S·v = 0`` per metabolite; ``allow_excretion`` relaxes it to ``≥ 0`` + (net production allowed). With ``prod_weight > 0`` a per-metabolite sink + ``s_m ∈ [0,1]`` is added and rewarded, giving a reason to include connectivity + reactions. +* Objective: **maximise** ``Σ score·x + prod_weight·Σ s_m``. + +Needs a MILP solver (cobra's configured optlang solver). On genome-scale problems, +Gurobi is the only backend that is fully usable today (see +``docs/init_solver_benchmark.md``). + +**Parameter caveat — magic numbers are scale-dependent.** ``eps`` (the flux an +included reaction must carry, default 1.0) and ``prod_weight`` (default 0.5) only make +sense when reaction bounds are ~±1000 and scores are O(1); the right values depend on +the model's flux magnitudes and the score distribution. The upper gate uses each +reaction's own ``ub`` as the big-M by default (adapts to the model); pass ``big_m`` to +override with a fixed cap for a tighter LP relaxation. Calibration tables live in +``docs/init_param_calibration.md``. 
def _split_reactions(
    model: cobra.Model, scores: Mapping[str, float], essential: set[str]
) -> list[_Directed]:
    """Split every reaction of ``model`` into non-negative directed halves.

    The forward half keeps the reaction id as key and is capped at ``upper_bound``;
    the reverse half is keyed ``"<id>__rev"``, carries the negated stoichiometry and
    is capped at ``-lower_bound``. Both halves inherit the reaction's score
    (``0.0`` when it is missing from ``scores``).

    Non-essential reactions get a forward half when ``upper_bound > 0`` and a
    reverse half when ``lower_bound < 0``. Essential reactions get exactly one half:
    forward if the reaction can run forward, otherwise reverse. Emitting both would
    force flux through each, i.e. a self-loop instead of real pathway flux.
    """
    halves: list[_Directed] = []
    for rxn in model.reactions:
        rid = rxn.id
        weight = float(scores.get(rid, 0.0))
        stoich = {met.id: coef for met, coef in rxn.metabolites.items()}
        flipped = {mid: -coef for mid, coef in stoich.items()}
        must_keep = rid in essential

        runs_forward = rxn.upper_bound > 0
        if must_keep:
            # single direction only: forward when possible, else reverse
            want_fwd, want_rev = runs_forward, not runs_forward
        else:
            want_fwd, want_rev = runs_forward, rxn.lower_bound < 0

        if want_fwd:
            halves.append(_Directed(rid, rid, stoich, rxn.upper_bound, weight, must_keep))
        if want_rev:  # reverse direction as its own non-negative flux
            halves.append(
                _Directed(f"{rid}__rev", rid, flipped, -rxn.lower_bound, weight, must_keep)
            )
    return halves
+ """ + scores = dict(rxn_scores or {}) + essential = set(essential_rxns or []) + present = list(present_mets or []) + + directed = _split_reactions(model, scores, essential) + prob = model.problem + opt = prob.Model() + + # Flux variables for every directed reaction. + flux = {d.key: prob.Variable(f"v_{d.key}", lb=0.0, ub=d.ub) for d in directed} + + # Binary include-indicators for non-essential reactions; eps*x <= v <= ub*x. + keep: dict[str, object] = {} + gates = [] + for d in directed: + if d.essential: + flux[d.key].lb = max(eps, 0.0) # forced to carry flux + continue + x = prob.Variable(f"x_{d.key}", type="binary") + keep[d.key] = x + cap = d.ub if big_m is None else big_m # big-M: per-reaction bound (default) or fixed + gates.append(prob.Constraint(flux[d.key] - cap * x, ub=0.0, name=f"ub_{d.key}")) + gates.append(prob.Constraint(flux[d.key] - eps * x, lb=0.0, name=f"lb_{d.key}")) + + # no_rev_loops: at most one direction of a reversible reaction is included. + by_origin: dict[str, list[str]] = {} + for d in directed: + by_origin.setdefault(d.origin, []).append(d.key) + if no_rev_loops: + for keys in by_origin.values(): + xs = [keep[k] for k in keys if k in keep] + if len(xs) > 1: + gates.append(prob.Constraint(sum(xs), ub=1.0, name=f"onedir_{keys[0]}")) + + # Steady-state constraints S·v (- sink) {==0 | >=0}, plus prod_weight sinks. + # Accumulate each metabolite's terms by iterating reactions once (avoids the + # O(mets·rxns) per-metabolite filter) and sum with optlang.symbolics.add — Python + # sum() re-canonicalises a growing sympy expression each step (O(n²)), which is + # minutes per hub metabolite at genome scale. 
+ met_terms: dict[str, list] = {met.id: [] for met in model.metabolites} + for d in directed: + v = flux[d.key] + for mid, coeff in d.coeffs.items(): + met_terms[mid].append(mul([Real(coeff), v])) + + sinks: dict[str, object] = {} + met_constraints: dict[str, object] = {} + ub = None if allow_excretion else 0.0 + for met in model.metabolites: + terms = met_terms[met.id] + if prod_weight != 0: + s = prob.Variable(f"s_{met.id}", lb=0.0, ub=1.0) + sinks[met.id] = s + terms = [*terms, mul([Real(-1.0), s])] # net production drained into rewarded sink + if terms: + met_constraints[met.id] = prob.Constraint(add(terms), lb=0.0, ub=ub) + + opt.add(list(flux.values()) + list(keep.values()) + list(sinks.values()) + + gates + list(met_constraints.values())) + + objective = prob.Objective( + add([mul([Real(d.score), keep[d.key]]) for d in directed if d.key in keep] + + [mul([Real(prod_weight), s]) for s in sinks.values()]), + direction="max", + ) + opt.objective = objective + + met_production = _check_present_mets(prob, present, model, directed, allow_excretion) + + if time_limit is not None: + opt.configuration.timeout = int(time_limit) + if mip_gap is not None: + try: # Gurobi-specific; harmless if the backend differs + opt.problem.Params.MIPGap = mip_gap + except Exception: # noqa: BLE001 + pass + opt.optimize() + # With a MIP gap / time limit set, accept a near-optimal incumbent (as RAVEN does). + if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"): + raise RuntimeError(f"INIT MILP did not solve (status: {opt.status}).") + + # A reaction is kept if any of its directed parts is essential or has x≈1. 
def _check_present_mets(prob, present, model, directed, allow_excretion) -> dict[str, bool]:
    """Report, per requested metabolite name, whether it can be net-produced.

    Names are matched case-insensitively against each metabolite's ``name`` (or its
    ``id`` when the name is empty), so one name may cover several compartment forms.
    A name with no match maps to ``False``. For every matched name a fresh LP is
    built on ``prob``: one non-negative flux variable per directed reaction,
    steady-state rows (``>= 0`` only when ``allow_excretion``), one drain per matched
    metabolite, and a constraint that the drains sum to at least 1. The name is
    producible iff that LP solves to ``"optimal"``.

    Returns ``{}`` when ``present`` is empty.
    """
    if not present:
        return {}

    ids_by_name: dict[str, list[str]] = {}
    for met in model.metabolites:
        label = (met.name or met.id).upper()
        if label not in ids_by_name:
            ids_by_name[label] = []
        ids_by_name[label].append(met.id)

    balance_ub = None if allow_excretion else 0.0

    def producible(met_ids) -> bool:
        """Solve the feasibility LP that drains ≥ 1 unit of ``met_ids`` in total."""
        lp = prob.Model()
        flux = {d.key: prob.Variable(f"v_{d.key}", lb=0.0, ub=d.ub) for d in directed}
        drains = {mid: prob.Variable(f"drain_{mid}", lb=0.0, ub=1e6) for mid in met_ids}

        balance: dict[str, list] = {met.id: [] for met in model.metabolites}
        for d in directed:
            v = flux[d.key]
            for mid, coeff in d.coeffs.items():
                balance[mid].append(mul([Real(coeff), v]))
        for mid, drain in drains.items():
            balance[mid].append(mul([Real(-1.0), drain]))

        rows = [
            prob.Constraint(add(row), lb=0.0, ub=balance_ub)
            for row in balance.values()
            if row
        ]
        demand = prob.Constraint(
            add(list(drains.values())), lb=1.0, name="_require_production"
        )
        lp.add([*flux.values(), *drains.values(), *rows, demand])
        # constant objective: only feasibility matters
        lp.objective = prob.Objective(prob.Variable("_zero", lb=0, ub=0), direction="max")
        lp.optimize()
        return lp.status == "optimal"

    verdict: dict[str, bool] = {}
    for name in present:
        met_ids = ids_by_name.get(name.upper())
        if met_ids:
            verdict[name] = producible(met_ids)
        else:
            verdict[name] = False
    return verdict
--git a/src/raven_python/init/merge.py b/src/raven_python/init/merge.py new file mode 100644 index 0000000..a26f41c --- /dev/null +++ b/src/raven_python/init/merge.py @@ -0,0 +1,226 @@ +"""Linear reaction merging for ftINIT. + +ftINIT shrinks the MILP losslessly by **contracting linear reaction chains**: a +metabolite that appears in exactly two reactions (one net producer, one net consumer) +links them into a single combined reaction. Iterating this collapses unbranched +pathways — on Human-GEM ~12k → ~8k reactions, a ~⅓ smaller MILP — without changing +the feasible flux space. Reversible reactions may merge too (unlike +``simplifyModel``'s merge), which is why ftINIT ships its own. + +:func:`merge_linear` returns the reduced model plus the bookkeeping needed to map +scores and results back to the original reactions: + +* ``group_ids`` — one integer per original reaction; ``0`` = not merged, equal + non-zero integers = merged into the same combined reaction (which keeps one + member's id). +* ``reversed_rxns`` — which originals were flipped (their stored direction negated) + when oriented for merging; needed to map fluxes/directions back. + +:func:`group_rxn_scores` then sums the original per-reaction scores over each group, +with RAVEN's zero-handling (see its docstring): genuine 0 → 0.01, ignore-masked → 0, +a group cancelling to 0 with non-zero members → 0.01 — all so the MILP never sees an +exactly-zero score (whose on/off would be arbitrary). 
def merge_linear(
    model: cobra.Model, no_merge: Iterable[str] = ()
) -> tuple[cobra.Model, list[str], list[int], list[bool]]:
    """Merge linearly-dependent reactions; return ``(reduced, orig_ids, group_ids, reversed)``.

    ``no_merge`` reaction ids are never merged. The reduced model carries no genes
    (merging makes GPRs meaningless); scores are remapped with
    :func:`group_rxn_scores`.

    Each pass recomputes the metabolite→reaction incidence fresh, then merges over the
    degree-2 metabolites found at the start of the pass. A metabolite that only
    *becomes* degree-2 mid-pass (because one of its reactions was just merged into a
    survivor) is therefore picked up on the next pass rather than immediately — linear
    merging is confluent, so the final grouping is the same regardless, it just takes a
    few extra passes on long chains. (RAVEN re-finds incidence per metabolite and so
    finishes a chain in one pass; the end result is equivalent.)

    Returns a 4-tuple: the reduced model built by :func:`_build_model`; the original
    reaction ids in model order; one group id per original reaction (``0`` = never
    merged, equal non-zero ids = same combined reaction); and one flag per original
    reaction telling whether its direction was negated while orienting for a merge.
    ``model`` itself is not modified — the merge works on ``_Rxn`` copies.

    Raises ``RuntimeError`` if a candidate pair cannot be oriented as
    producer/consumer (a defensive check after the producer/consumer pre-filter).
    """
    banned = set(no_merge)
    orig_ids = [r.id for r in model.reactions]
    # Bookkeeping keyed by ORIGINAL reaction id; updated as groups form and flip.
    group_of: dict[str, int] = {rid: 0 for rid in orig_ids}
    reversed_of: dict[str, bool] = {rid: False for rid in orig_ids}
    next_group = 1  # next unused group id (0 is reserved for "not merged")

    # Mutable working copies: stoichiometry keyed by metabolite id, plus bounds.
    rxns = [
        _Rxn(r.id, r.name, {m.id: c for m, c in r.metabolites.items()},
             r.lower_bound, r.upper_bound)
        for r in model.reactions
    ]

    def flip(rx: _Rxn) -> None:
        # Negate the working reaction (stoichiometry and bounds) and toggle the
        # reversed flag of every original in its group, or just itself if unmerged.
        rx.coeffs = {m: -c for m, c in rx.coeffs.items()}
        rx.lb, rx.ub = -rx.ub, -rx.lb
        grp = group_of[rx.id]
        targets = [o for o in orig_ids if group_of[o] == grp] if grp else [rx.id]
        for o in targets:
            reversed_of[o] = not reversed_of[o]

    def relabel(rx: _Rxn, grp: int) -> None:
        # Move rx — and, if it already belongs to a group, that entire group — to grp.
        old = group_of[rx.id]
        if old == grp:
            return
        if old == 0:
            group_of[rx.id] = grp
        else:
            for o in orig_ids:
                if group_of[o] == old:
                    group_of[o] = grp

    while True:
        # Fresh incidence per pass: metabolite id -> indices (into rxns) that use it.
        incidence: dict[str, list[int]] = defaultdict(list)
        for i, rx in enumerate(rxns):
            for m in rx.coeffs:
                incidence[m].append(i)
        degree2 = [m for m, ii in incidence.items() if len(ii) == 2]

        merged_some = False
        for met in degree2:
            # Re-check against the live coeffs: a reaction merged earlier in this
            # pass has had its coeffs cleared and no longer counts.
            involved = [i for i in incidence[met] if met in rxns[i].coeffs]
            if len(involved) != 2:
                continue  # one side already merged away this pass
            a, b = involved
            if rxns[a].id in banned or rxns[b].id in banned:
                continue
            ca, cb = rxns[a].coeffs[met], rxns[b].coeffs[met]
            ra, rb = rxns[a].reversible, rxns[b].reversible
            # Booleans add as 0/1: how many of the two reactions can act as a
            # producer (pos) / consumer (neg) of `met`; reversible ones count for both.
            pos = (ca > 0 or ra) + (cb > 0 or rb)
            neg = (ca < 0 or ra) + (cb < 0 or rb)
            if pos < 1 or neg < 1:
                continue  # need one producer and one consumer

            r1, r2 = a, b
            # Special case: rev producer first, irrev producer second → swap (RAVEN l.74).
            if rxns[r1].reversible and not rxns[r2].reversible \
                    and rxns[r1].coeffs[met] > 0 and rxns[r2].coeffs[met] > 0:
                r1, r2 = r2, r1
            # Make r1 the producer of `met`.
            if rxns[r1].coeffs[met] < 0:
                if rxns[r2].coeffs[met] > 0:
                    r1, r2 = r2, r1
                elif rxns[r1].reversible:
                    flip(rxns[r1])
                elif rxns[r2].reversible:
                    flip(rxns[r2])
                    r1, r2 = r2, r1
                else:
                    raise RuntimeError("mergeLinear: no producer orientation possible.")
            # Make r2 the consumer.
            if rxns[r2].coeffs[met] > 0:
                if rxns[r2].reversible:
                    flip(rxns[r2])
                else:
                    raise RuntimeError("mergeLinear: no consumer orientation possible.")

            # Combined reaction = r1 + ratio·r2, with ratio chosen so `met` cancels.
            ratio = abs(rxns[r1].coeffs[met] / rxns[r2].coeffs[met])
            merged = defaultdict(float, rxns[r1].coeffs)
            for m, c in rxns[r2].coeffs.items():
                merged[m] += c * ratio
            merged[met] = 0.0  # exact zero, so rounding cannot leave a residue
            # Drop numerically-zero coefficients (including `met`).
            rxns[r1].coeffs = {m: c for m, c in merged.items() if abs(c) > _TOL}

            # Most-constraining bounds win (RAVEN scales r2's bounds by the ratio).
            if not math.isinf(rxns[r2].lb):
                rxns[r1].lb = max(rxns[r1].lb, rxns[r2].lb / ratio)
            if not math.isinf(rxns[r2].ub):
                rxns[r1].ub = min(rxns[r1].ub, rxns[r2].ub / ratio)
            rxns[r2].coeffs = {}  # cleared → removed after the pass

            # Reuse the larger existing group id of the pair; allocate a new one
            # only when both reactions were still unmerged (max(...) == 0).
            grp = max(group_of[rxns[r1].id], group_of[rxns[r2].id]) or next_group
            if grp == next_group:
                next_group += 1
            relabel(rxns[r1], grp)
            relabel(rxns[r2], grp)
            merged_some = True

        if not merged_some:
            break
        # Drop every working reaction whose stoichiometry is now empty.
        rxns = [rx for rx in rxns if rx.coeffs]

    return _build_model(model, rxns), orig_ids, [group_of[o] for o in orig_ids], \
        [reversed_of[o] for o in orig_ids]
reduced.add_reactions(new_rxns) + for rx, r in zip(rxns, new_rxns, strict=True): + r.add_metabolites({reduced.metabolites.get_by_id(m): c for m, c in rx.coeffs.items()}) + return reduced + + +def group_rxn_scores( + reduced_model: cobra.Model, + orig_scores: Mapping[str, float], + orig_rxn_ids: list[str], + group_ids: list[int], + to_zero: Iterable[str] = (), +) -> dict[str, float]: + """Sum original reaction scores over merged groups (RAVEN ``groupRxnScores``). + + ``orig_scores`` maps original reaction id → score; ``to_zero`` are reactions to + drop from the problem (the ``toIgnore`` masks) — their score becomes 0. Genuine + zeros and groups cancelling to zero become 0.01 so the MILP never sees an exactly + zero score. Returns ``{reduced_reaction_id: score}``. + """ + zero = set(to_zero) + group_of = dict(zip(orig_rxn_ids, group_ids, strict=True)) + # Per-original adjusted score: genuine 0 → 0.01, then ignore-masked → 0. + adj: dict[str, float] = {} + for rid in orig_rxn_ids: + s = float(orig_scores.get(rid, 0.0)) + s = 0.01 if s == 0.0 else s + adj[rid] = 0.0 if rid in zero else s + members: dict[int, list[str]] = defaultdict(list) + for rid in orig_rxn_ids: + if group_of[rid] != 0: # only merged groups need member lists + members[group_of[rid]].append(rid) + + scores: dict[str, float] = {} + for r in reduced_model.reactions: + grp = group_of[r.id] + if grp == 0: # unmerged: keep the reaction's own (adjusted) score + scores[r.id] = adj[r.id] + else: + group = members[grp] + total = sum(adj[m] for m in group) + if total == 0.0 and any(adj[m] != 0.0 for m in group): + total = 0.01 # cancelled to zero but had non-zero members + scores[r.id] = total + return scores diff --git a/src/raven_python/init/prep.py b/src/raven_python/init/prep.py new file mode 100644 index 0000000..8ed4b89 --- /dev/null +++ b/src/raven_python/init/prep.py @@ -0,0 +1,241 @@ +"""ftINIT preprocessing — once-per-template work shared by every sample on a model. 
+ +ftINIT does all omics-independent work once: classify reactions into the categories +the staged MILP may *ignore* (leave in, never remove), discover task-essential +reactions, linearly merge, and scale. The result (:class:`PrepData`) is reused across +every sample. + +:func:`classify_reactions` is the reaction taxonomy: exchange, GPR-less +import / simple / advanced transport, spontaneous, GPR-less extracellular, custom, and +"any without a GPR". The staged schedule (:func:`raven_python.init.get_init_steps`) selects +which categories to keep out of each MILP step via an 8-bit pattern. +""" +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import dataclass, field + +import cobra + +from raven_python.init.merge import merge_linear +from raven_python.tasks import Task, find_task_essential_reactions + + +@dataclass +class ReactionMasks: + """Reaction-category id sets (RAVEN's ``toIgnore*``), in 8-bit-pattern order. + + ``ignored(pattern)`` returns the union of the categories whose bit is set — the + reactions held out of (left untouched by) that MILP step. 
def _is_advanced_transport(rxn: cobra.Reaction) -> bool:
    """Tell whether ``rxn`` moves several species between compartments and nothing else.

    True only when the reaction has an even number of metabolites greater than two
    and they split into pairs such that each pair shares a name, sits in two
    different compartments, and has stoichiometric coefficients summing to zero. A
    name that occurs once, or more than twice, disqualifies the reaction.
    """
    entries = [(met.name, met.compartment, coef) for met, coef in rxn.metabolites.items()]
    if len(entries) <= 2 or len(entries) % 2 != 0:
        return False

    pending = entries
    while pending:
        (name, comp, coef), rest = pending[0], pending[1:]
        partners = [idx for idx, other in enumerate(rest) if other[0] == name]
        if len(partners) != 1:
            return False
        idx = partners[0]
        _, partner_comp, partner_coef = rest[idx]
        if coef + partner_coef != 0 or comp == partner_comp:
            return False
        # the pair is consumed; continue with whatever is left
        pending = rest[:idx] + rest[idx + 1:]
    return True
def rescale_for_init(model: cobra.Model, max_stoich_diff: float = 25.0) -> None:
    """Flatten each reaction's stoichiometric spread, then normalise the bounds.

    Works in place in two passes over ``model.reactions``:

    1. Per reaction with at least one metabolite: every coefficient whose magnitude
       exceeds ``max_stoich_diff × min|coeff|`` is clipped to that limit (sign kept),
       then all coefficients are multiplied by a common factor so that their mean
       absolute value is 1. The new values overwrite the old ones
       (``add_metabolites(..., combine=False)``).
    2. Per reaction: a positive upper bound becomes ``1000.0`` and a negative lower
       bound becomes ``-1000.0``; bounds at zero or of the opposite sign are left
       alone.

    The point is that a wide coefficient range forces a matching range of flux
    magnitudes, which no single big-M can cover.
    """

    def clamp(coef: float, limit: float) -> float:
        if abs(coef) > limit:
            return limit if coef > 0 else -limit
        return coef

    # Pass 1: stoichiometry.
    for rxn in model.reactions:
        stoich = list(rxn.metabolites.items())
        if not stoich:
            continue
        limit = max_stoich_diff * min(abs(coef) for _, coef in stoich)
        clipped = {met: clamp(coef, limit) for met, coef in stoich}
        magnitude = sum(abs(coef) for coef in clipped.values())
        factor = (len(clipped) / magnitude) if magnitude else 1.0
        rescaled = {met: coef * factor for met, coef in clipped.items()}
        rxn.add_metabolites(rescaled, combine=False)

    # Pass 2: reset every open bound to the standard ±1000.
    for rxn in model.reactions:
        if rxn.upper_bound > 0:
            rxn.upper_bound = 1000.0
        if rxn.lower_bound < 0:
            rxn.lower_bound = -1000.0
+ + With ``tasks``, discovers the task-essential reactions (kept regardless of score), + orients them irreversibly in their required direction, and drops tasks that are + infeasible. Then classifies reactions into the omics-independent categories, linearly + merges, and (unless ``scale=False``) rescales the merged model's stoichiometry + (:func:`rescale_for_init`) so a single MILP big-M is valid across all reactions — + without this, genome-scale ftINIT is infeasible / intractable. + + ``essential_cache_path`` makes the (slow, genome-scale) essential-reaction discovery + **resumable** across interruptions — see :func:`find_task_essential_reactions`. + """ + ref_model = template.copy() + + essential_pre: dict[str, int] = {} + task_mets: set[str] = set() + kept_tasks: list[Task] = [] + if tasks is not None: + tasks = list(tasks) + ess = find_task_essential_reactions(ref_model, tasks, cache_path=essential_cache_path) + essential_pre = ess.reactions + task_mets = ess.task_metabolites + kept_tasks = [t for t in tasks if t.id not in ess.failed_tasks] + + # Orient essentials irreversibly (forced direction → forward) before merging, so + # the merge keeps them forward and the MILP forces them with a simple lower bound. + for rid, direction in essential_pre.items(): + _orient_forward(ref_model.reactions.get_by_id(rid), direction) + + masks = classify_reactions(ref_model, ext_comp=ext_comp, + spontaneous=spontaneous, custom=custom) + + min_model, orig_ids, group_ids, reversed_rxns = merge_linear(ref_model) + if scale: # compress stoichiometric dynamic range so the MILP big-M fits all reactions + rescale_for_init(min_model) + group_of = dict(zip(orig_ids, group_ids, strict=True)) + + # Map essentials to the merged model: the survivor of each group containing an + # essential (or the reaction itself if unmerged). All are forward after orientation. + # An essential that merged into a group which collapsed away (e.g. 
a trivial + # source→sink chain) has no survivor and imposes no constraint — skip it. + survivor_by_group = {group_of[r.id]: r.id for r in min_model.reactions if group_of[r.id]} + essential_merged: set[str] = set() + for rid in essential_pre: + gid = group_of[rid] + if gid == 0: + essential_merged.add(rid) + elif gid in survivor_by_group: + essential_merged.add(survivor_by_group[gid]) + + return PrepData( + ref_model=ref_model, + min_model=min_model, + orig_rxn_ids=orig_ids, + group_ids=group_ids, + reversed_rxns=reversed_rxns, + masks=masks, + essential_rxns=essential_merged, + essential_mets_for_tasks=task_mets, + tasks=kept_tasks, + ) diff --git a/src/raven_python/init/score.py b/src/raven_python/init/score.py new file mode 100644 index 0000000..6e14f86 --- /dev/null +++ b/src/raven_python/init/score.py @@ -0,0 +1,86 @@ +"""Score reactions from gene scores via the GPR. + +Maps per-gene scores (e.g. expression-derived: present → positive, absent → negative) +to per-reaction scores by walking each reaction's GPR: genes joined by **OR** +(isozymes) are combined with ``isozyme_scoring`` (default ``max``); genes joined by +**AND** (complexes) with ``complex_scoring`` (default ``min``). Genes missing from +``gene_scores`` are *omitted*; a reaction with no genes — or whose genes are all +missing — gets ``no_gene_score`` (default −2). These reaction scores feed +:func:`raven_python.init.run_init` and :func:`raven_python.init.ftinit`. + +Upstream — the omics-data → gene-score step (thresholding, expression levels) — lives +in :mod:`raven_python.omics`; this function takes gene scores as given. 
# Aggregators selectable by name for isozyme (OR) and complex (AND) scoring.
_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean}


def gene_scores_from_expression(
    expression: Mapping[str, float],
    reference: Mapping[str, float] | float,
    *,
    factor: float = 5.0,
    max_score: float = 10.0,
    min_score: float = -5.0,
) -> dict[str, float]:
    """Gene scores from RNA-seq/array expression, RAVEN's ``5·ln(level/reference)``.

    This is tINIT's usual entry point (RNA-seq is the common case; single-cell and
    HPA are alternative upstream sources).

    Parameters
    ----------
    expression
        ``{gene: expression level}``.
    reference
        Either a per-gene reference level (e.g. the cross-sample mean) or a single
        ``int``/``float`` threshold shared by all genes. A gene expressed above its
        reference scores positive, below it negative.
    factor
        Multiplier applied to ``ln(level / reference)``.
    max_score, min_score
        The score is clamped to ``[min_score, max_score]``.

    Returns
    -------
    dict[str, float]
        ``{gene: score}`` for every gene in ``expression``. Any gene whose score is
        undefined gets ``min_score`` (RAVEN maps these NaNs to -5): a missing,
        zero, negative or NaN level or reference, and a ratio that is NaN or not
        positive (e.g. an infinite reference).
    """
    scalar = isinstance(reference, (int, float))
    scores: dict[str, float] = {}
    for gene, level in expression.items():
        ref = reference if scalar else reference.get(gene)
        score = min_score
        # ``x > 0`` is False for NaN, so NaN inputs stay at min_score rather than
        # leaking a NaN score through max()/min().
        if level and ref and level > 0 and ref > 0:
            ratio = level / ref
            # ratio can still be 0 (infinite reference, underflow) or NaN (inf/inf);
            # math.log would raise on the former and propagate the latter.
            if ratio > 0:
                value = factor * math.log(ratio)
                if not math.isnan(value):
                    score = max(min(value, max_score), min_score)
        scores[gene] = score
    return scores


def _score_node(node, gene_scores: Mapping[str, float], iso, cplx) -> float | None:
    """Recursively score one GPR AST node; ``None`` when no gene below it has a score.

    A name node yields that gene's score (or ``None`` if absent from
    ``gene_scores``). A boolean node aggregates the scored children with ``iso``
    for OR and ``cplx`` for AND, ignoring unscored children. Any other node type
    yields ``None``.
    """
    if isinstance(node, ast.Name):
        return gene_scores.get(node.id)  # None if the gene has no score
    if isinstance(node, ast.BoolOp):
        agg = iso if isinstance(node.op, ast.Or) else cplx
        vals = [s for v in node.values if (s := _score_node(v, gene_scores, iso, cplx)) is not None]
        return agg(vals) if vals else None
    return None


def score_reactions_from_genes(
    model: cobra.Model,
    gene_scores: Mapping[str, float],
    *,
    isozyme_scoring: str = "max",
    complex_scoring: str = "min",
    no_gene_score: float = -2.0,
) -> dict[str, float]:
    """Return ``{reaction_id: score}`` from per-gene scores via each reaction's GPR.

    Genes joined by OR are combined with ``isozyme_scoring`` and genes joined by
    AND with ``complex_scoring``; both must be a key of ``_AGG`` (``"min"``,
    ``"max"``, ``"median"``, ``"average"``). A reaction without a GPR, without
    genes, or whose genes all lack a score receives ``no_gene_score``.

    Raises
    ------
    ValueError
        If ``isozyme_scoring`` or ``complex_scoring`` is not a known aggregator.
    """
    for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)):
        if value not in _AGG:
            raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.")
    iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring]

    scores: dict[str, float] = {}
    for rxn in model.reactions:
        body = rxn.gpr.body
        if body is None or not rxn.genes:
            scores[rxn.id] = no_gene_score
        else:
            value = _score_node(body, gene_scores, iso, cplx)
            scores[rxn.id] = no_gene_score if value is None else float(value)
    return scores
Each step +(:class:`InitStep`) chooses which reaction categories to hold out of the problem +(``ignore_mask``, an 8-bit pattern over :class:`raven_python.init.ReactionMasks`), whether +to drop positive reversibles and allow metabolite secretion, and how to treat the +reactions turned on by previous steps (``'ignore'`` for the first step, ``'essential'`` +to fix them on). :func:`get_init_steps` builds the standard schedules. + +The default ``'1+1'`` is two steps: step 1 decides only the GPR-associated reactions +(everything GPR-less is held out); step 2 brings the GPR-less transport / extracellular +reactions in with step-1 reactions fixed as essential. ``'full'`` is the single-MILP +classic-tINIT variant (nothing held out). +""" +from __future__ import annotations + +from collections.abc import Sequence +from dataclasses import dataclass, field + +# 8-bit ignore patterns (exchange, import, simple-transp, adv-transp, spontaneous, +# extracellular, custom, no-GPR) — see ReactionMasks. +_ALL_NO_GPR_KEPT = (1, 1, 1, 1, 1, 1, 1, 0) # hold out every GPR-less category but "all no-GPR" +_EXCH_SPONT = (1, 0, 0, 0, 1, 0, 0, 0) # hold out only exchange + spontaneous +_NONE = (0, 0, 0, 0, 0, 0, 0, 0) + + +@dataclass +class InitStep: + """One ftINIT MILP step.""" + + how_to_use_prev: str = "essential" # 'ignore' | 'essential' + ignore_mask: tuple[int, ...] = _ALL_NO_GPR_KEPT + pos_rev_off: bool = False # drop positive reversibles from the problem + allow_met_secr: bool = False # relax S·v = 0 to ≥ 0 + mets_to_ignore: Sequence[str] = field(default_factory=tuple) # met names zeroed from S (e.g. H2O) + + +def get_init_steps(series: str = "1+1", *, mets_to_ignore: Sequence[str] = ()) -> list[InitStep]: + """Return the step schedule for a named ftINIT ``series`` (RAVEN ``getINITSteps``). + + ``'1+1'`` (default, step 1+2 merged), ``'2+1'`` (3-step), ``'1+0'``/``'2+0'`` + (skip the final GPR-less step), ``'full'`` (single MILP). 
``mets_to_ignore`` are + metabolite names removed from the stoichiometry in each step (e.g. H2O, H+). + """ + m = tuple(mets_to_ignore) + s1 = InitStep("ignore", _ALL_NO_GPR_KEPT, mets_to_ignore=m) + s1_posrev = InitStep("ignore", _ALL_NO_GPR_KEPT, pos_rev_off=True, allow_met_secr=True, + mets_to_ignore=m) + s2_all = InitStep("essential", _ALL_NO_GPR_KEPT, mets_to_ignore=m) + s_final = InitStep("essential", _EXCH_SPONT, mets_to_ignore=m) + + if series == "1+1": + return [s1, s_final] + if series == "2+1": + return [s1_posrev, s2_all, s_final] + if series == "1+0": + return [s1] + if series == "2+0": + return [s1_posrev, s2_all] + if series == "full": + return [InitStep("ignore", _NONE, mets_to_ignore=m)] + raise ValueError(f"Unknown ftINIT series {series!r}; expected 1+1, 2+1, 1+0, 2+0, full.") diff --git a/src/raven_python/init/taskfill.py b/src/raven_python/init/taskfill.py new file mode 100644 index 0000000..58501ce --- /dev/null +++ b/src/raven_python/init/taskfill.py @@ -0,0 +1,183 @@ +"""Task gap-filling for ftINIT. + +After ftINIT extracts a context-specific model, some metabolic tasks may no longer be +feasible (the scoring removed reactions a task needs). :func:`fill_tasks` restores +feasibility by adding back the **minimum-cost** set of reactions from the reference +(template) model — cost = ``−score``, so high-scoring reactions are preferred — one +task at a time, only for tasks that are actually infeasible (a cheap LP check gates +the expensive MILP), accumulating additions across tasks. + +This is a different MILP from ftINIT's main extraction: it *adds* reactions to satisfy +the task's ranged metabolite bounds (RAVEN's two-column ``b``), rather than selecting +which to keep by expression score. Exchange reactions are not used to fill gaps (task +inputs/outputs come from the task's ``b``), so they are excluded as candidates. 
_DEFAULT_SCORE = -1.0  # RAVEN: missing scores default to -1 (cost 1)
_MAX_SCORE = -0.1  # RAVEN min(score, -0.1): every added reaction costs ≥ 0.1


@dataclass
class TaskFillResult:
    """Outcome of :func:`fill_tasks`."""

    model: cobra.Model  # the gap-filled model
    added_reactions: list[str]  # ids of the reactions that were added
    failed_tasks: list[str]  # ids of tasks for which gap-filling raised


def _closed_copy(model: cobra.Model) -> cobra.Model:
    """Copy ``model`` and close every boundary reaction of the copy.

    With the boundaries shut, task inputs/outputs can only come from the task's b.
    """
    clone = model.copy()
    for boundary_rxn in clone.boundary:
        boundary_rxn.bounds = (0.0, 0.0)
    return clone


def _feasible(model: cobra.Model, task: Task, name_to_id, comp_to_ids) -> bool:
    """Check, in place, whether ``task`` is feasible in ``model`` with boundaries closed.

    The model is not copied (the copy dominated gap-fill runtime for genome-scale
    models). ``with model:`` rolls back the closed boundaries and whatever
    ``apply_task_constraints`` changes through cobra's API; the mass-balance
    constraint bounds, which are edited directly and therefore not tracked, are
    snapshotted beforehand and restored afterwards (as in check_tasks).

    Returns ``False`` when the task references missing metabolites, when it cannot
    be applied, or when the solver status after optimizing is not ``"optimal"``.
    """
    bounds, missing = _metabolite_bounds(task, name_to_id, comp_to_ids)
    if missing:
        return False
    snapshot = {}
    for mid in bounds:
        constraint = model.constraints[mid]
        snapshot[mid] = (constraint.lb, constraint.ub)
    ok = False
    try:
        with model:
            for boundary_rxn in model.boundary:
                boundary_rxn.bounds = (0.0, 0.0)
            _, error = apply_task_constraints(model, task, name_to_id, comp_to_ids)
            if error is None:
                model.slim_optimize()
                ok = model.solver.status == "optimal"
    finally:
        for mid, (lb, ub) in snapshot.items():
            _set_constraint_bounds(model.constraints[mid], lb, ub)
    return ok


def _fill_one_task(
    model: cobra.Model, candidates: list[cobra.Reaction], task: Task,
    costs: dict[str, float], *, mip_gap: float | None = None, time_limit: float | None = None,
) -> list[str]:
    """Solve the MILP choosing the cheapest ``candidates`` that make ``task`` feasible.

    Works on a boundary-closed copy of ``model`` to which copies of all candidates
    are added, each gated by a binary variable whose cost enters the minimized
    objective. ``mip_gap`` / ``time_limit`` bound the solve: with a binary per
    candidate (thousands), proving min-cost optimality can be intractable, and a
    near-optimal fill is the accepted trade-off.

    Returns the ids of the candidates whose binary is on (primal > 0.5).

    Raises
    ------
    RuntimeError
        If there are no candidates, the task cannot be applied, or the solver
        ends without a usable incumbent.
    """
    if not candidates:  # nothing left to add → task cannot be made feasible
        raise RuntimeError(f"gap-filling found no candidates for task {task.id!r}.")
    combined = _closed_copy(model)  # task I/O via the task's b, not the model's exchanges
    combined.add_reactions([cand.copy() for cand in candidates])
    name_to_id, comp_to_ids = task_name_maps(combined)
    _, error = apply_task_constraints(combined, task, name_to_id, comp_to_ids)
    if error is not None:
        raise RuntimeError(f"task {task.id!r} could not be applied to the reference: {error}")

    prob = combined.problem
    new_items = []
    cost_terms = []
    for cand in candidates:
        added = combined.reactions.get_by_id(cand.id)
        switch = prob.Variable(f"_fill_{cand.id}", type="binary")
        # Switch off ⇒ flux pinned to 0; switch on ⇒ the reaction's own bounds apply.
        below_upper = prob.Constraint(
            added.flux_expression - added.upper_bound * switch, ub=0.0, name=f"_fillub_{cand.id}"
        )
        above_lower = prob.Constraint(
            added.flux_expression - added.lower_bound * switch, lb=0.0, name=f"_filllb_{cand.id}"
        )
        new_items.extend((switch, below_upper, above_lower))
        cost_terms.append(mul([Real(costs[cand.id]), switch]))
    combined.add_cons_vars(new_items)
    # add() over a flat list instead of Python sum(): the latter is O(n²) in sympy
    # and dominates runtime with thousands of candidates.
    combined.objective = prob.Objective(add(cost_terms), direction="min")
    if time_limit is not None:
        combined.solver.configuration.timeout = int(time_limit)
    if mip_gap is not None:
        try:  # Gurobi-specific; harmless if the backend differs
            combined.solver.problem.Params.MIPGap = mip_gap
        except Exception:  # noqa: BLE001
            pass
    combined.slim_optimize()

    # A near-optimal incumbent (mip_gap/time_limit) is accepted; only a fill with no
    # incumbent at all means the task cannot be satisfied from the reference.
    has_incumbent_status = combined.solver.status in (
        "optimal", "feasible", "suboptimal", "time_limit"
    )
    if not has_incumbent_status or \
            combined.variables[f"_fill_{candidates[0].id}"].primal is None:
        raise RuntimeError(f"gap-filling found no way to make task {task.id!r} feasible.")
    return [
        cand.id
        for cand in candidates
        if (combined.variables[f"_fill_{cand.id}"].primal or 0.0) > 0.5
    ]


def fill_tasks(
    model: cobra.Model,
    reference_model: cobra.Model,
    tasks: Iterable[Task],
    *,
    rxn_scores: Mapping[str, float] | None = None,
    mip_gap: float | None = None,
    time_limit: float | None = None,
) -> TaskFillResult:
    """Add the cheapest reference reactions needed to make every task feasible.

    Parameters
    ----------
    model
        Model to gap-fill; it is copied, not modified.
    reference_model
        Source of candidate reactions: those whose id is not in ``model`` and
        that are not boundary reactions.
    tasks
        Tasks to satisfy. ``should_fail`` tasks are ignored and tasks that are
        already feasible are skipped.
    rxn_scores
        Original reaction id → score; adding a candidate costs
        ``−min(score, −0.1)`` (missing score → cost 1).
    mip_gap, time_limit
        Passed on to each per-task MILP.

    Returns
    -------
    TaskFillResult
        The gap-filled copy, the ids added (in order), and the ids of tasks whose
        fill raised ``RuntimeError``. The model is carried forward between
        tasks, so later tasks see earlier additions. Boundary reactions are only
        closed while a task is tested/solved; the returned model keeps them.
    """
    scores = dict(rxn_scores or {})
    tasks = list(tasks)
    existing = {rxn.id for rxn in model.reactions}
    candidates = [
        rxn for rxn in reference_model.reactions
        if rxn.id not in existing and not rxn.boundary
    ]
    costs = {}
    for cand in candidates:
        costs[cand.id] = -min(scores.get(cand.id, _DEFAULT_SCORE), _MAX_SCORE)

    out = model.copy()
    added: list[str] = []
    failed: list[str] = []
    for task in tasks:
        if task.should_fail:
            continue
        name_to_id, comp_to_ids = task_name_maps(out)
        if _feasible(out, task, name_to_id, comp_to_ids):
            continue
        # Offer only the candidates not yet present in the (growing) model.
        present = {rxn.id for rxn in out.reactions}
        avail = [cand for cand in candidates if cand.id not in present]
        try:
            chosen = _fill_one_task(
                out, avail, task, costs, mip_gap=mip_gap, time_limit=time_limit
            )
        except RuntimeError:
            failed.append(task.id)
            continue
        if chosen:
            out.add_reactions(
                [reference_model.reactions.get_by_id(rid).copy() for rid in chosen]
            )
            added.extend(chosen)
    return TaskFillResult(out, added, failed)
def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str:
    """Render an annotation dict as a RAVEN MIRIAM cell.

    The result is ``namespace/id;namespace/id2;...`` with namespaces in sorted
    order; namespaces listed in ``exclude`` are left out. A plain-string value is
    treated as a single id.
    """
    chunks = []
    for namespace in sorted(annotation):
        if namespace in exclude:
            continue
        identifiers = annotation[namespace]
        if isinstance(identifiers, str):
            identifiers = [identifiers]
        for identifier in identifiers:
            chunks.append(f"{namespace}/{identifier}")
    return ";".join(chunks)


def _equation(rxn: cobra.Reaction) -> str:
    """Format the reaction as a readable equation in RAVEN ``name[comp]`` form.

    Negative coefficients go on the left, positive ones on the right, joined by
    ``<=>`` for reversible reactions and ``=>`` otherwise.
    """
    substrates, products = [], []
    for met, coef in rxn.metabolites.items():
        term = f"{abs(coef):g} {met.name}[{met.compartment}]"
        if coef < 0:
            substrates.append(term)
        elif coef > 0:
            products.append(term)
    arrow = " <=> " if rxn.reversibility else " => "
    return " + ".join(substrates) + arrow + " + ".join(products)


def _ec_codes(rxn: cobra.Reaction) -> str:
    """Return the reaction's ``ec-code`` annotation as a ``;``-joined string ('' if none)."""
    codes = rxn.annotation.get("ec-code", [])
    return codes if isinstance(codes, str) else ";".join(codes)


def export_to_excel(
    model: cobra.Model, path: str | Path, *, sort_ids: bool = False
) -> None:
    """Write ``model`` to a RAVEN-format ``.xlsx`` file.

    The workbook gets the sheets RXNS, METS, COMPS, GENES (only when the model has
    genes) and MODEL. RAVEN-specific values are read back from the objects'
    ``annotation`` / ``notes``.

    Parameters
    ----------
    model
        Model to export.
    path
        Destination file.
    sort_ids
        If True, write reactions/metabolites/genes/compartments sorted
        alphabetically by ID (the model itself is not modified).

    Raises
    ------
    ImportError
        If the optional ``openpyxl`` dependency is not installed.
    """
    try:
        from openpyxl import Workbook
    except ImportError as exc:  # pragma: no cover - exercised only without openpyxl
        raise ImportError(
            "export_to_excel requires openpyxl. Install it with "
            "`pip install raven_python[excel]` (or `pip install openpyxl`)."
        ) from exc

    def ordered(objects):
        """Objects sorted by id when requested, otherwise in model order."""
        return sorted(objects, key=lambda obj: obj.id) if sort_ids else list(objects)

    reactions = ordered(model.reactions)
    metabolites = ordered(model.metabolites)
    genes = ordered(model.genes)
    metadata = dict(model.notes.get("metaData", {})) if model.notes else {}

    workbook = Workbook()
    workbook.remove(workbook.active)  # drop the default empty sheet

    # --- RXNS ---
    sheet = workbook.create_sheet("RXNS")
    sheet.append(
        ["#", "ID", "NAME", "EQUATION", "EC-NUMBER", "GENE ASSOCIATION", "LOWER BOUND",
         "UPPER BOUND", "OBJECTIVE", "COMPARTMENT", "MIRIAM", "SUBSYSTEM",
         "REPLACEMENT ID", "NOTE", "REFERENCE", "CONFIDENCE SCORE"]
    )
    for rxn in reactions:
        subsystem = rxn.subsystem
        if isinstance(subsystem, (list, tuple)):
            subsystem = ";".join(subsystem)
        row = [
            None, rxn.id, rxn.name, _equation(rxn), _ec_codes(rxn), rxn.gene_reaction_rule,
            rxn.lower_bound, rxn.upper_bound,
            rxn.objective_coefficient or None, None,
            _miriam_string(rxn.annotation, exclude=("ec-code",)), subsystem, None,
            rxn.notes.get("note"), rxn.notes.get("references"),
            rxn.notes.get("confidence_score"),
        ]
        sheet.append(row)

    # --- METS ---
    sheet = workbook.create_sheet("METS")
    sheet.append(["#", "ID", "NAME", "UNCONSTRAINED", "MIRIAM", "COMPOSITION", "InChI",
                  "COMPARTMENT", "REPLACEMENT ID", "CHARGE"])
    for met in metabolites:
        inchi = met.notes.get("inchis")
        # The formula is written only when no InChI is available.
        composition = None if inchi else met.formula
        sheet.append([
            None, f"{met.name}[{met.compartment}]", met.name, None,
            _miriam_string(met.annotation, exclude=("smiles",)),
            composition, inchi, met.compartment, met.id, met.charge,
        ])

    # --- COMPS ---
    sheet = workbook.create_sheet("COMPS")
    sheet.append(["#", "ABBREVIATION", "NAME", "INSIDE", "MIRIAM"])
    compartment_ids = sorted(model.compartments) if sort_ids else list(model.compartments)
    for comp_id in compartment_ids:
        sheet.append([None, comp_id, model.compartments.get(comp_id, ""), None, None])

    # --- GENES ---
    if genes:
        sheet = workbook.create_sheet("GENES")
        sheet.append(["#", "NAME", "MIRIAM", "SHORT NAME", "COMPARTMENT"])
        for gene in genes:
            sheet.append([None, gene.id, _miriam_string(gene.annotation), gene.name, None])

    # --- MODEL ---
    sheet = workbook.create_sheet("MODEL")
    sheet.append(["#", "ID", "NAME", "TAXONOMY", "DEFAULT LOWER", "DEFAULT UPPER",
                  "CONTACT GIVEN NAME", "CONTACT FAMILY NAME", "CONTACT EMAIL",
                  "ORGANIZATION", "NOTES"])
    sheet.append([
        None, model.id or "blankID", model.name or "blankName",
        metadata.get("taxonomy"), metadata.get("defaultLB"), metadata.get("defaultUB"),
        metadata.get("givenName"), metadata.get("familyName"), metadata.get("email"),
        metadata.get("organization"), metadata.get("note"),
    ])

    workbook.save(str(path))
# Every format export_for_git knows how to write.
_ALL_FORMATS = ("yml", "xml", "mat", "xlsx", "txt")


def _version(package: str) -> str:
    """Installed version of ``package``, or ``"unknown"`` when it is not installed."""
    try:
        found = _md.version(package)
    except _md.PackageNotFoundError:
        found = "unknown"
    return found


def _write_txt(model: cobra.Model, path: Path) -> None:
    """Write a single-file, tab-separated reaction table (RAVEN exportForGit txt)."""
    header = "Rxn name\tFormula\tGene-reaction association\tLB\tUB\tObjective\n"
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(header)
        for rxn in model.reactions:
            handle.write(
                f"{rxn.id}\t{_equation(rxn)}\t{rxn.gene_reaction_rule}\t"
                f"{rxn.lower_bound:g}\t{rxn.upper_bound:g}\t{rxn.objective_coefficient:g}\n"
            )


def export_for_git(
    model: cobra.Model,
    path: str | Path = ".",
    *,
    prefix: str = "model",
    formats: Iterable[str] = ("yml", "xml", "mat", "xlsx"),
    sub_dirs: bool = True,
) -> Path:
    """Write ``model`` into a Standard-GEM repository layout.

    A copy of the model with sorted identifiers is exported, so the caller's
    model is left untouched. A ``dependencies.txt`` listing the python, cobra and
    raven_python versions is always written next to the model files.

    Parameters
    ----------
    model
        Model to export.
    path
        Directory to populate.
    prefix
        Base filename for every format (default ``"model"``).
    formats
        Which formats to write; any of ``"yml"``, ``"xml"``, ``"mat"``,
        ``"xlsx"``, ``"txt"`` (default ``yml``/``xml``/``mat``/``xlsx``).
    sub_dirs
        If True (default), files go to ``<path>/model/<format>/<prefix>.<format>``
        (standard-GEM layout); otherwise all files go directly in ``path``.

    Returns
    -------
    pathlib.Path
        The root directory written to.

    Raises
    ------
    ValueError
        If ``formats`` contains a name outside the supported set.
    """
    formats = list(formats)
    unknown = set(formats) - set(_ALL_FORMATS)
    if unknown:
        raise ValueError(f"Unknown format(s): {sorted(unknown)}; allowed: {_ALL_FORMATS}")

    # Sort a copy so the caller's model is untouched.
    model = sort_identifiers(model.copy())

    base = Path(path)
    root = base / "model" if sub_dirs else base
    root.mkdir(parents=True, exist_ok=True)

    def target(fmt: str) -> Path:
        """Destination file for ``fmt``; creates its folder on demand."""
        folder = root / fmt if sub_dirs else root
        folder.mkdir(parents=True, exist_ok=True)
        return folder / f"{prefix}.{fmt}"

    # One writer per format, run in this fixed order for the requested formats.
    writers = {
        "yml": lambda dest: write_yaml_model(model, dest),
        "xml": lambda dest: cobra.io.write_sbml_model(model, str(dest)),
        "mat": lambda dest: cobra.io.save_matlab_model(model, str(dest)),
        "xlsx": lambda dest: export_to_excel(model, dest),
        "txt": lambda dest: _write_txt(model, dest),
    }
    for fmt, write in writers.items():
        if fmt in formats:
            write(target(fmt))

    with open(root / "dependencies.txt", "w", encoding="utf-8") as handle:
        handle.write(f"python\t{platform.python_version()}\n")
        handle.write(f"cobra\t{_version('cobra')}\n")
        handle.write(f"raven_python\t{_version('raven_python')}\n")

    return root
# Supported graph types: reaction–compound, reaction–reaction, compound–compound.
_GRAPH_TYPES = ("rc", "rr", "cc")


def _edges(model, graph_type):
    """Yield ``(source_object, [target_objects])`` pairs for the requested graph type.

    ``"rc"`` links each reaction to its metabolites, ``"rr"`` links each reaction
    to the other reactions sharing one of its metabolites, and any other value is
    treated as ``"cc"``: on an irreversible copy of the model, each metabolite is
    linked to the products of the reactions that consume it.
    """
    if graph_type == "rc":
        for rxn in model.reactions:
            yield rxn, list(rxn.metabolites)
        return
    if graph_type == "rr":
        for rxn in model.reactions:
            sharing = {other for met in rxn.metabolites for other in met.reactions}
            sharing.discard(rxn)
            yield rxn, list(sharing)
        return
    # cc — computed on an irreversible copy
    irreversible = model.copy()
    convert_to_irreversible(irreversible)
    for met in irreversible.metabolites:
        reachable: set = set()
        for rxn in met.reactions:
            if rxn.get_coefficient(met) < 0:  # met is a substrate here
                reachable.update(m for m, coef in rxn.metabolites.items() if coef > 0)
        yield met, list(reachable)


def export_model_to_sif(
    model: cobra.Model,
    path: str | Path,
    graph_type: str = "rc",
    *,
    reaction_labels: Mapping[str, str] | None = None,
    metabolite_labels: Mapping[str, str] | None = None,
) -> None:
    """Write ``model`` to a Cytoscape SIF file.

    Each written line is ``source<TAB>graph_type<TAB>target1<TAB>target2...`` with
    the targets sorted, de-duplicated by label and never equal to the source
    label; sources without targets are omitted.

    Parameters
    ----------
    model
        Model to export.
    path
        Destination file.
    graph_type
        ``"rc"`` (reaction–compound, default), ``"rr"`` (reaction–reaction), or
        ``"cc"`` (compound–compound).
    reaction_labels, metabolite_labels
        Optional ``{id: label}`` maps overriding the node labels (default: IDs).
        A warning is issued when a map assigns the same label to several ids,
        since those nodes collapse in the output.

    Raises
    ------
    ValueError
        If ``graph_type`` is not one of the supported values.
    """
    if graph_type not in _GRAPH_TYPES:
        raise ValueError(f"graph_type must be one of {_GRAPH_TYPES}, got {graph_type!r}")

    rlabels = reaction_labels or {}
    mlabels = metabolite_labels or {}

    # Target-side dedup works on labels, so two ids mapped to one label silently
    # merge into one node. Only explicitly mapped ids are checked.
    for kind, lmap in (("reaction", rlabels), ("metabolite", mlabels)):
        label_counts = Counter(lmap.values())
        duplicates = [lab for lab, count in label_counts.items() if count > 1]
        if not duplicates:
            continue
        warnings.warn(
            f"{kind}_labels maps multiple ids to the same label(s) "
            f"({duplicates[:5]}{'…' if len(duplicates) > 5 else ''}); "
            "SIF nodes are keyed by label, so those nodes will collapse.",
            stacklevel=2,
        )

    def label(obj) -> str:
        """Node label: the mapped label if any, else the object's id."""
        mapping = rlabels if isinstance(obj, cobra.Reaction) else mlabels
        return mapping.get(obj.id, obj.id)

    with open(path, "w", encoding="utf-8") as handle:
        for source, targets in _edges(model, graph_type):
            src = label(source)
            names = sorted({label(target) for target in targets} - {src})
            if not names:
                continue
            handle.write(f"{src}\t{graph_type}\t" + "\t".join(names) + "\n")
def _open_text(path: str | Path, mode: str):
    """Return a UTF-8 text handle for ``path``.

    Paths whose string form ends in ``.gz`` are opened through :mod:`gzip`
    (with ``"t"`` appended to ``mode``); everything else uses the builtin ``open``.
    """
    compressed = str(path).endswith(".gz")
    opener, text_mode = (gzip.open, f"{mode}t") if compressed else (open, mode)
    return opener(path, text_mode, encoding="utf-8")
def _capture_entry_fields(entries, fields):
    """Pop RAVEN-only top-level keys off each entry into a parallel notes dict.

    Parameters
    ----------
    entries
        List of per-object dicts (metabolites / reactions / genes); mutated in
        place — every key named in ``fields`` is removed.
    fields
        ``(yaml_key, notes_key)`` pairs: the key as it appears in the YAML entry
        and the key to store it under in the returned notes dict.

    Returns
    -------
    list[dict]
        One ``{notes_key: value}`` dict per entry, aligned with ``entries`` (so
        cobra never sees these keys), to be attached to the built objects
        afterwards.

    A ``notes`` value that is itself a mapping (cobra's native notes dict, or the
    merged form the writer emits when free text and other notes coexist) is
    flattened into the result rather than nested under the free-text key, so the
    in-memory notes survive a write/read round trip unchanged. Explicit top-level
    keys win over same-named keys inside that mapping.
    """
    captured = []
    for entry in entries:
        notes = {}
        nested = {}
        for yaml_key, notes_key in fields:
            if yaml_key not in entry:
                continue
            value = entry.pop(yaml_key)
            if yaml_key == "notes" and isinstance(value, dict):
                # Defer merging so the result does not depend on field order.
                nested = value
            else:
                notes[notes_key] = value
        for key, value in nested.items():
            notes.setdefault(key, value)
        captured.append(notes)
    return captured
def write_yaml_model(
    model: cobra.Model, path: str | Path, *, sort_ids: bool = False
) -> None:
    """Write a ``cobra.Model`` to RAVEN/cobrapy (``!!omap``) YAML.

    Parameters
    ----------
    model
        The model to serialise. It is not modified: the bookkeeping entries kept
        on ``model.notes`` (``metaData``, ``version``, ``_yaml_sections``) are
        read from copies.
    path
        Destination file; gzip-compressed when it ends in ``.gz``.
    sort_ids
        With ``True`` metabolites/reactions/genes/compartments are written in
        alphabetical order (diff-friendly), without modifying ``model``.
    """
    model_notes = dict(model.notes or {})
    stored_meta = model_notes.pop("metaData", None) or {}
    version = model_notes.pop("version", None)
    # Copy: keys are popped from this mapping below, and the stored dict is the
    # very object living on model.notes — popping from it directly would strip
    # sections off the model and lose them on the next write.
    foreign = dict(model_notes.pop("_yaml_sections", None) or {})

    doc = OrderedDict(_to_plain(model_to_dict(model)))

    # The dict built from the model carries model.notes wholesale, including the
    # bookkeeping entries that are emitted as top-level sections below. Keep only
    # the genuine user notes so nothing is written twice.
    if "notes" in doc:
        if model_notes:
            doc["notes"] = _to_plain(model_notes)
        else:
            del doc["notes"]

    if sort_ids:
        for section in ("metabolites", "reactions", "genes"):
            if section in doc:
                doc[section] = sorted(doc[section], key=lambda e: e.get("id", ""))
        if isinstance(doc.get("compartments"), dict):
            doc["compartments"] = dict(sorted(doc["compartments"].items()))

    _emit_entry_fields(doc.get("metabolites", []), _MET_FIELDS)
    _emit_entry_fields(doc.get("reactions", []), _RXN_FIELDS)
    _emit_entry_fields(doc.get("genes", []), _GENE_FIELDS)

    # cobra dict order is metabolites, reactions, genes, id, name, compartments;
    # append version / gecko_light / metaData / ec-* like RAVEN's writer.
    if version is not None:
        doc["version"] = version
    metadata = dict(stored_meta)
    if model.id:
        metadata.setdefault("id", model.id)
    if model.name:
        metadata.setdefault("name", model.name)
    for key in ("gecko_light",):
        if key in foreign:
            doc[key] = foreign.pop(key)
    if metadata:
        doc["metaData"] = metadata
    for key, value in foreign.items():
        doc[key] = value

    with _open_text(path, "w") as handle:
        _cobra_yaml.dump(doc, handle)
+""" +from raven_python.localization.predict import ( + LocalizationProposal, + LocalizationResult, + apply_localization, + predict_localization, +) +from raven_python.localization.scores import ( + LocalizationScores, + load_deeploc, + load_wolfpsort, +) + +__all__ = [ + "LocalizationProposal", + "LocalizationResult", + "LocalizationScores", + "apply_localization", + "load_deeploc", + "load_wolfpsort", + "predict_localization", +] diff --git a/src/raven_python/localization/predict.py b/src/raven_python/localization/predict.py new file mode 100644 index 0000000..0fb8596 --- /dev/null +++ b/src/raven_python/localization/predict.py @@ -0,0 +1,378 @@ +"""Sub-cellular localisation by MILP. + +Assigns reactions to compartments by maximising per-gene localisation evidence minus +inter-compartment transport cost. Key behaviour: + +* The caller passes the set of reactions to (re-)place (``reactions_to_relocate``); + everything else is pinned. Boundary reactions and existing inter-compartment + transports are always pinned even if listed. +* Incomplete models are tolerated — no silent reaction removal for "metabolite not + produced". Reactions with no scored genes are reported in ``unplaced_reactions``. +* Deterministic MILP solve (Gurobi / HiGHS / GLPK). +* ``apply=False`` returns a :class:`LocalizationProposal` (a diff) without mutating. +* **Multi-compartment by default.** A gene can land in several compartments — its + highest-scoring compartment is "free", every additional compartment costs + ``multi_compartment_penalty``. Secondary compartments naturally have lower predictor + scores (an implicit penalty) and are only picked when their score still exceeds the + explicit penalty. Set ``multi_compartment_penalty`` very high for effectively + mono-localised genes. + +Limitations to be aware of: + +* Isozyme separation is *not* applied internally — a reaction with isozymes is treated + as "all listed genes must share its compartment". 
def predict_localization(
    model: cobra.Model,
    scores: LocalizationScores,
    reactions_to_relocate: Iterable[str],
    *,
    default_compartment: str = "c",
    transport_cost: float | Mapping[str, float] = 0.5,
    multi_compartment_penalty: float = 0.5,
    apply: bool = True,
    mip_gap: float | None = None,
    time_limit: float | None = None,
) -> LocalizationResult | LocalizationProposal:
    """Place a caller-specified set of reactions in compartments via MILP.

    Returns a :class:`LocalizationProposal` (when ``apply=False``) or a
    :class:`LocalizationResult` (when ``apply=True``). With ``apply=True`` the
    returned ``model`` is always a copy; the input model is never handed back.

    ``reactions_to_relocate``: the reaction ids to (re-)place. Everything else stays
    where it is. Boundary reactions and existing multi-compartment transports passed
    in this set are silently filtered out (always pinned). Pass an empty set or a list
    of zero non-boundary reactions to no-op.

    ``transport_cost``: either a scalar (same cost per added transport) or a mapping
    ``{metabolite_id_base: cost}`` (where the base id strips the compartment suffix,
    e.g. ``"glc__D"`` matches ``"glc__D_c"``/``"glc__D_e"``). Metabolites missing from
    the mapping cost ``0.5``. Negative costs *favour* adding the transport.

    **Multi-compartment gene scoring (default behaviour):** a gene contributes its
    predictor score in each compartment it lands in; the highest-scoring compartment
    is "free", each additional compartment costs ``multi_compartment_penalty``. A
    secondary compartment is only worth picking when its score (typically lower than
    the primary) still exceeds the penalty — no hard cutoff, just an explicit
    score-vs-penalty trade-off. Set ``multi_compartment_penalty`` very large for
    effectively mono-localised genes.

    ``mip_gap`` is applied on a best-effort basis through the Gurobi parameter
    interface and is ignored when that interface is unavailable. ``time_limit`` is
    truncated to whole seconds.

    Raises
    ------
    ValueError
        If ``default_compartment`` is in neither the model nor the score table.
    RuntimeError
        If the solver ends in a status other than optimal / feasible /
        suboptimal / time_limit.
    """
    # ---- 1. Scope: which reactions move, which are pinned. -----------------
    to_relocate = set(reactions_to_relocate)
    # Boundaries / transports always pin (even if listed).
    to_relocate -= {r.id for r in model.reactions
                    if r.boundary or _reaction_compartment(r) is None}
    if not to_relocate:
        return _empty_result(model, apply)

    # ---- 2. Compartments universe (model + scores). ------------------------
    compartments = sorted(set(model.compartments) | set(scores.compartments))
    if default_compartment not in compartments:
        raise ValueError(f"default_compartment={default_compartment!r} not in known "
                         f"compartments {compartments}")

    # ---- 3. Gather genes for the relocate-set, build score lookup. ---------
    # Genes only mentioned by pinned reactions don't enter the MILP.
    moving = [model.reactions.get_by_id(rid) for rid in sorted(to_relocate)]
    genes_in_scope: set[str] = set()
    unplaced: list[str] = []
    for r in moving:
        gs = _reaction_genes(r)
        scored = [g for g in gs if g in scores.df.index]
        if not gs:
            # GPR-less reaction: place it freely (no gene coupling). Allowed.
            continue
        if not scored:
            # All genes absent from predictor → no signal; report and skip.
            unplaced.append(r.id)
            continue
        genes_in_scope.update(scored)
    # Remove reactions we can't score from the placement set.
    unplaced_ids = set(unplaced)
    placeable = [r for r in moving if r.id not in unplaced_ids]
    if not placeable:
        # Everything in the relocate set lacks scored genes — return a proposal with
        # only the unplaced list. The result carries a copy of the model, like every
        # other apply=True return path, so callers may mutate it freely.
        prop = LocalizationProposal(
            moved=pd.DataFrame(columns=["rxn_id", "from_compartment", "to_compartment"]),
            added_transports=pd.DataFrame(columns=["met_id", "compartment"]),
            gene_compartments={}, unplaced_reactions=unplaced, objective=0.0)
        return prop if not apply else LocalizationResult(model.copy(), prop)

    # Fixed iteration order for genes: set order varies with string hashing, and
    # the order variables enter the problem can change which of several tied
    # optima the solver reports.
    genes = sorted(genes_in_scope)

    # ---- 4. Per-metabolite transport cost. ---------------------------------
    def _met_cost(m_id: str) -> float:
        if not isinstance(transport_cost, (int, float)):
            base = m_id.rsplit("_", 1)[0]
            return float(transport_cost.get(base, transport_cost.get(m_id, 0.5)))
        return float(transport_cost)

    # ---- 5. Build the MILP. ------------------------------------------------
    model.solver  # noqa: B018 — ensure the solver is initialised so model.problem works
    prob = model.problem
    opt = prob.Model()

    # x[r, c] = 1 iff reaction r placed in c (only for r ∈ placeable)
    x: dict[tuple[str, str], object] = {
        (r.id, c): prob.Variable(f"x_{r.id}_{c}", type="binary")
        for r in placeable for c in compartments
    }
    # y[g, c] = 1 iff gene g assigned to c
    y: dict[tuple[str, str], object] = {
        (g, c): prob.Variable(f"y_{g}_{c}", type="binary")
        for g in genes for c in compartments
    }
    # t[m_id, c] = 1 iff metabolite m (keyed by its full id, compartment suffix
    # included) needs a transport to compartment c (c ≠ default).
    met_keys: set[tuple[str, str]] = set()
    for r in placeable:
        for m in r.metabolites:
            for c in compartments:
                if c != default_compartment:
                    met_keys.add((m.id, c))
    t: dict[tuple[str, str], object] = {
        k: prob.Variable(f"t_{k[0]}_{k[1]}", type="binary") for k in sorted(met_keys)
    }

    cons: list = []
    # 5a. Each placeable reaction goes to exactly one compartment.
    for r in placeable:
        cons.append(prob.Constraint(add([mul([Real(1.0), x[r.id, c]]) for c in compartments]),
                                    lb=1.0, ub=1.0, name=f"place_{r.id}"))
    # 5b. Gene-reaction coupling: if r placed in c, every scored gene of r must be in c.
    for r in placeable:
        for g in _reaction_genes(r):
            if g not in genes_in_scope:
                continue
            for c in compartments:
                # x[r,c] − y[g,c] ≤ 0
                cons.append(prob.Constraint(x[r.id, c] - y[g, c], ub=0.0,
                                            name=f"gene_{r.id}_{g}_{c}"))
    # 5c. Gene assignment: each gene in scope lands in ≥1 compartment. Multi is allowed;
    #     the multi_compartment_penalty in the objective keeps extras from coming for free.
    for g in genes:
        s = add([mul([Real(1.0), y[g, c]]) for c in compartments])
        cons.append(prob.Constraint(s, lb=1.0, name=f"gene_one_{g}"))
    # 5d. Transport requirement: t[m,c] ≥ x[r,c] whenever r touches m and c ≠ default.
    for r in placeable:
        for m in r.metabolites:
            for c in compartments:
                if c == default_compartment:
                    continue
                # x[r,c] − t[m,c] ≤ 0
                cons.append(prob.Constraint(x[r.id, c] - t[m.id, c], ub=0.0,
                                            name=f"trans_{r.id}_{m.id}_{c}"))

    opt.add(list(x.values()) + list(y.values()) + list(t.values()) + cons)

    # 5e. Objective.
    obj_terms = []
    # + per-gene per-compartment localisation score (missing column / NaN → 0)
    score_lookup = scores.df  # gene_id × compartment → float
    for g in genes:
        for c in compartments:
            if c in score_lookup.columns and not pd.isna(score_lookup.at[g, c]):
                s = float(score_lookup.at[g, c])
            else:
                s = 0.0
            if s:
                obj_terms.append(mul([Real(s), y[g, c]]))
    # − transport cost per added transport
    for (m_id, _c), tvar in t.items():
        cost = _met_cost(m_id)
        if cost:
            obj_terms.append(mul([Real(-cost), tvar]))
    # − multi-compartment penalty per *extra* compartment (the primary is free).
    # Per gene: penalty * (Σ_c y[g,c] - 1) = penalty * Σ_c y[g,c] - penalty (constant). The
    # constant doesn't affect optimisation but is added back to the reported objective so
    # the value matches the "primary free" intent the user reads off the proposal.
    constant_offset = 0.0
    if multi_compartment_penalty:
        for yvar in y.values():
            obj_terms.append(mul([Real(-multi_compartment_penalty), yvar]))
        constant_offset = multi_compartment_penalty * len(genes)

    opt.objective = prob.Objective(add(obj_terms) if obj_terms else Real(0.0), direction="max")
    if time_limit is not None:
        opt.configuration.timeout = int(time_limit)
    if mip_gap is not None:
        # Deliberately best-effort: only the Gurobi backend exposes ``Params``;
        # any other backend simply solves with its default gap.
        try:  # Gurobi-specific
            opt.problem.Params.MIPGap = mip_gap
        except Exception:  # noqa: BLE001
            pass

    opt.optimize()
    if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"):
        raise RuntimeError(f"localisation MILP did not solve (status: {opt.status}).")

    # ---- 6. Read the solution into a proposal. -----------------------------
    moved_rows: list[dict] = []
    for r in placeable:
        # First compartment whose binary is set; ``primal`` may be None when the
        # solver stopped without a value, which counts as "not chosen".
        chosen = next(
            (c for c in compartments if (x[r.id, c].primal or 0.0) >= 0.5), None
        )
        from_c = _reaction_compartment(r)
        if chosen and chosen != from_c:
            moved_rows.append({"rxn_id": r.id, "from_compartment": from_c,
                               "to_compartment": chosen})
    moved = pd.DataFrame(moved_rows, columns=["rxn_id", "from_compartment", "to_compartment"])

    transp_rows: list[dict] = [
        {"met_id": m_id, "compartment": c}
        for (m_id, c), tvar in t.items()
        if (tvar.primal or 0.0) >= 0.5
    ]
    added_transports = pd.DataFrame(transp_rows, columns=["met_id", "compartment"])

    gene_comps: dict[str, list[str]] = {
        g: [c for c in compartments if (y[g, c].primal or 0.0) >= 0.5]
        for g in genes
    }

    proposal = LocalizationProposal(
        moved=moved, added_transports=added_transports, gene_compartments=gene_comps,
        unplaced_reactions=unplaced,
        objective=float((opt.objective.value or 0.0) + constant_offset))

    if not apply:
        return proposal
    new_model, transports = apply_localization(model, proposal, default_compartment=default_compartment)
    return LocalizationResult(model=new_model, proposal=proposal, added_transports=transports)
def _base_met_id(m: cobra.Metabolite) -> str:
    """Return the metabolite id without its trailing ``_<compartment>`` suffix.

    The id is returned unchanged when the metabolite has no compartment or the
    id does not end with that suffix.
    """
    if not m.compartment:
        return m.id
    return m.id.removesuffix(f"_{m.compartment}")
@dataclass
class LocalizationScores:
    """Gene × compartment score table.

    ``df`` has one row per ``gene_id`` (the index) and one column per compartment
    id; values are floats where higher means stronger evidence for that compartment.

    Genes absent from ``df`` and NaN entries are treated as "no signal" by
    :func:`raven_python.localization.predict_localization` (uniform prior contribution).
    """

    df: pd.DataFrame

    def __post_init__(self) -> None:
        index = self.df.index
        if isinstance(index, pd.Index) and index.name in (None, "gene_id"):
            return
        # Index carries some other name: work on a copy so the caller's frame
        # keeps its own index name, and label ours ``gene_id``.
        relabelled = self.df.copy()
        relabelled.index.name = "gene_id"
        self.df = relabelled

    @property
    def genes(self) -> list[str]:
        """Gene ids, in index order."""
        return list(self.df.index)

    @property
    def compartments(self) -> list[str]:
        """Compartment ids, in column order."""
        return list(self.df.columns)

    def with_compartments(self, mapping: Mapping[str, str]) -> LocalizationScores:
        """Return a new table with compartment columns renamed via ``{old: new}``
        (e.g. predictor labels → model compartments). Unmapped columns are kept;
        multiple sources can be merged with ``df.combine_first`` afterwards."""
        renames = dict(mapping)
        return LocalizationScores(self.df.rename(columns=renames))
def _normalise_rows(s: LocalizationScores) -> LocalizationScores:
    """Scale every gene's row so its best compartment scores 1.0 (RAVEN's
    parseScores convention).

    Rows whose maximum is not strictly positive are left as they are — there is
    no positive evidence to scale against. The input table is not modified.
    """
    scaled = s.df.copy()
    peaks = scaled.max(axis=1)
    has_signal = peaks > 0
    rescaled_rows = scaled.loc[has_signal].div(peaks[has_signal], axis=0)
    scaled.loc[has_signal] = rescaled_rows
    return LocalizationScores(scaled)
+ +Most of the equivalent MATLAB code is struct-of-arrays bookkeeping (padding parallel +``rxnNames`` / ``lb`` / ``ub`` / ``grRules`` / ... fields) that does not exist in +cobra, where each ``Reaction`` carries its own attributes. cobra also already +covers a large part of the *behaviour*: + +* ``Reaction.build_reaction_from_string`` parses equation strings, coefficients, + and reversibility arrows (``<=>``, ``-->``, ``=>``) and creates unknown + metabolites — but only matching metabolites **by ID**, and it leaves new + metabolites with ``compartment=None``. +* assigning ``reaction.gene_reaction_rule`` auto-creates ``Gene`` objects. + +So this port keeps only the parts cobra lacks: + +* **name-based matching** — interpret equation tokens as metabolite *names* + (RAVEN eqnType 2) or as ``name[comp]`` (eqnType 3), not just IDs; +* **correct compartment** assignment for newly created metabolites; +* **strict policies** — optionally *error* (rather than silently create) on + unknown metabolites or genes, and always error on a duplicate reaction ID + (cobra silently ignores those). + +Instead of RAVEN's ``eqnType`` integer (1/2/3) the matching mode is a readable +keyword: ``mets_by="id"`` or ``mets_by="name"``, with ``name[comp]`` recognised +automatically. See IMPROVEMENTS.md (A-series) for the rationale. +""" +from __future__ import annotations + +import re +import warnings +from collections import OrderedDict +from collections.abc import Mapping, Sequence + +import cobra +from cobra import Metabolite, Reaction +from cobra.core.gene import GPR + +from raven_python.utils.parse import parse_name_comp + +# Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it). 
def _parse_side(side: str) -> list[tuple[float, str, str | None]]:
    """Parse one side of an equation into ``[(coefficient, token, fallback), ...]``.

    Terms are separated by ``" + "``. A term whose first whitespace-delimited
    word is a finite number is split into that coefficient and the remaining
    token; any other term gets coefficient ``1.0`` and is kept whole.

    The ``fallback`` slot is for the ambiguous ``"<number> <name>"`` shape: when
    matching by name, ``"2 oxoglutarate"`` could be either ``coeff=2, name="oxoglutarate"``
    or ``coeff=1, name="2 oxoglutarate"`` (a real chemistry name). The
    coefficient-split form is returned as the primary and the full term as the
    fallback, so a caller can retry the whole term as one token. Terms without a
    numeric head have no fallback.

    Words that ``float()`` accepts but that are not usable stoichiometric
    coefficients (``nan``, ``inf``, ``infinity``) are treated as part of the
    metabolite token, never as a coefficient.

    Raises
    ------
    ValueError
        If a term consists of a coefficient with no metabolite after it.
    """
    import math  # local: only needed for the finiteness check below

    terms: list[tuple[float, str, str | None]] = []
    for raw in side.split(" + "):
        term = raw.strip()
        if not term:
            continue
        # Split on any whitespace run so "2\tatp" parses like "2 atp".
        head, *rest = term.split(None, 1)
        try:
            coeff = float(head)
        except ValueError:
            coeff = None
        if coeff is None or not math.isfinite(coeff):
            terms.append((1.0, term, None))
            continue
        token = rest[0].strip() if rest else ""
        if not token:
            raise ValueError(f"Missing metabolite after coefficient in term: {raw!r}")
        terms.append((coeff, token, term))
    return terms
def _try_existing(
    model: cobra.Model, token: str, *, mets_by: str, compartment: str | None
) -> Metabolite | None:
    """Find the metabolite that ``token`` refers to, without creating anything.

    This is a read-only probe: it only inspects ``model.metabolites``. The
    caller uses it to disambiguate the ``"<number> <name>"`` shape, checking
    whether a metabolite whose name (or id) literally starts with a number
    already exists before the number is split off as a coefficient.

    Parameters
    ----------
    model
        Model whose metabolites are searched.
    token
        Equation token; passed through ``parse_name_comp`` to obtain a
        ``(name, compartment-or-None)`` pair (presumably the ``name[comp]``
        syntax — confirm against that helper).
    mets_by
        ``"id"`` looks the raw token up as a metabolite ID when it carries no
        explicit compartment; any other case matches on name + compartment.
    compartment
        Compartment used for name matching when the token has none of its own.

    Returns
    -------
    Metabolite or None
        The first matching metabolite, or ``None`` when nothing matches or no
        compartment is available for a name-based lookup.
    """
    base_name, explicit_comp = parse_name_comp(token)

    # Plain ID lookup: only when the token carries no [comp] part.
    if explicit_comp is None and mets_by == "id":
        if token in model.metabolites:
            return model.metabolites.get_by_id(token)
        return None

    wanted_comp = compartment if explicit_comp is None else explicit_comp
    if wanted_comp is None:
        return None

    candidates = (
        candidate
        for candidate in model.metabolites
        if candidate.name == base_name and candidate.compartment == wanted_comp
    )
    return next(candidates, None)
+ ) + _warn_unknown_compartment(model, compartment, token) + met = Metabolite(token, compartment=compartment) + model.add_metabolites([met]) + return met + + # name-based (mets_by="name") or explicit name[comp] + target_comp = comp if comp is not None else compartment + if target_comp is None: + raise ValueError( + f"Metabolite {token!r} matched by name needs a compartment; " + "pass compartment=... or use the name[comp] syntax." + ) + if comp is not None and target_comp not in model.compartments and not allow_new_mets: + raise ValueError(f"Compartment {target_comp!r} is not in the model.") + + matches = [ + met + for met in model.metabolites + if met.name == name and met.compartment == target_comp + ] + if matches: + return matches[0] + if not allow_new_mets: + raise ValueError( + f"No metabolite named {name!r} in compartment {target_comp!r}; " + "pass allow_new_mets=True to create it." + ) + _warn_unknown_compartment(model, target_comp, name) + met = Metabolite(_new_met_id(model, new_met_prefix), name=name, compartment=target_comp) + model.add_metabolites([met]) + return met + + +def _warn_unknown_compartment(model: cobra.Model, compartment: str, identifier: str) -> None: + """Warn when a new metabolite would be born into a not-yet-registered compartment. + + Both ``mets_by`` paths previously created the metabolite without validating + the compartment, so a typo (``"cyto"`` for ``"c"``) silently produced a + one-metabolite ghost compartment. cobra inherits the compartment from the + first metabolite assigned to it, so the fix is a warning, not a hard error. 
+ """ + known = set(model.compartments) | set(model._compartments) + if compartment not in known: + warnings.warn( + f"Creating metabolite {identifier!r} in unregistered compartment " + f"{compartment!r} (existing: {sorted(known) or 'none'}); " + "add the compartment first or check for a typo.", + stacklevel=5, + ) + + +def _stoichiometry( + model: cobra.Model, + equation: str, + *, + mets_by: str, + compartment: str | None, + allow_new_mets: bool, + new_met_prefix: str, +) -> tuple[dict[Metabolite, float], bool]: + """Parse an equation into a {Metabolite: net coefficient} dict + reversibility.""" + lhs, rhs, reversible = _split_equation(equation) + coeffs: OrderedDict[Metabolite, float] = OrderedDict() + had_terms = False + for sign, side in ((-1.0, lhs), (1.0, rhs)): + for coeff, token, fallback in _parse_side(side): + had_terms = True + # " " is ambiguous when the name itself starts with a + # number (e.g. "2 oxoglutarate"). Prefer the full-term interpretation + # when it matches an existing metabolite — otherwise fall through to + # the coefficient-split form. + met = None + if fallback is not None: + met = _try_existing( + model, fallback, mets_by=mets_by, compartment=compartment + ) + if met is not None: + coeff = 1.0 + if met is None: + met = _resolve_metabolite( + model, + token, + mets_by=mets_by, + compartment=compartment, + allow_new_mets=allow_new_mets, + new_met_prefix=new_met_prefix, + ) + coeffs[met] = coeffs.get(met, 0.0) + sign * coeff + # Drop metabolites that net to zero (present as both substrate and product). 
def add_reactions_from_equations(
    model: cobra.Model,
    reactions: Sequence[Mapping],
    *,
    mets_by: str = "id",
    compartment: str | None = None,
    allow_new_mets: bool = True,
    allow_new_genes: bool = True,
    new_met_prefix: str = "m",
) -> list[Reaction]:
    """Add reactions defined by equation strings, matching mets by ID or name.

    Parameters
    ----------
    model
        Target ``cobra.Model``, mutated in place.
    reactions
        Sequence of mappings, one per reaction. Recognised keys:

        * ``id`` (**required**) — reaction ID; must not already exist.
        * ``equation`` (**required**) — e.g. ``"atp_c + h2o_c <=> adp_c + pi_c"``.
          Use ``<=>`` for reversible, ``-->``/``->``/``=>`` for irreversible.
        * ``name`` — reaction name.
        * ``bounds`` — ``(lower, upper)`` tuple; overrides the arrow.
        * ``gene_reaction_rule`` — GPR string.
        * ``subsystem`` — subsystem name.
    mets_by
        How bare equation tokens (without ``[comp]``) are matched:
        ``"id"`` (RAVEN eqnType 1) or ``"name"`` (eqnType 2). A ``name[comp]``
        token (eqnType 3) is always matched by name + compartment.
    compartment
        Default compartment for new metabolites and for name-matched tokens
        without an explicit ``[comp]``.
    allow_new_mets
        If True (default), create metabolites not found. New metabolites get
        ``compartment`` (id mode) or an auto ID ``m1``, ``m2``, ... (name mode).
        If False, an unknown metabolite raises.
    allow_new_genes
        If True (default), genes in a GPR are auto-created by cobra. If False,
        a GPR referencing a gene not already in the model raises.
    new_met_prefix
        Prefix for auto-generated metabolite IDs in name mode (default ``"m"``).

    Returns
    -------
    list of cobra.Reaction
        The reactions added, in input order.

    Raises
    ------
    ValueError
        If ``mets_by`` is invalid, a spec lacks ``id`` or ``equation``, the
        reaction ID already exists, the GPR references unknown genes while
        ``allow_new_genes=False``, or the equation cannot be resolved.

    Notes
    -----
    Specs are processed one at a time. The ``id``, ``equation`` and gene checks
    for a spec run *before* that spec touches the model, so a spec rejected by
    them leaves no metabolites or reaction behind. Reactions from earlier specs
    in the same call are not rolled back when a later spec fails.
    """
    if mets_by not in ("id", "name"):
        raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}")

    known_genes = {gene.id for gene in model.genes}
    added: list[Reaction] = []

    for spec in reactions:
        if "id" not in spec:
            raise ValueError(f"Reaction spec missing required 'id': {spec!r}")
        rxn_id = spec["id"]
        if rxn_id in model.reactions:
            raise ValueError(
                f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first."
            )
        if "equation" not in spec:
            raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.")

        # Check the gene policy up front. Doing this after model.add_reactions()
        # would leave the rejected reaction (and any metabolites created for it)
        # in the model.
        rule = spec.get("gene_reaction_rule", "")
        if rule and not allow_new_genes:
            missing = sorted(set(GPR.from_string(rule).genes) - known_genes)
            if missing:
                raise ValueError(
                    f"Reaction {rxn_id!r} references genes not in the model: "
                    f"{missing}. Set allow_new_genes=True or add them first."
                )

        coeffs, reversible = _stoichiometry(
            model,
            spec["equation"],
            mets_by=mets_by,
            compartment=compartment,
            allow_new_mets=allow_new_mets,
            new_met_prefix=new_met_prefix,
        )

        rxn = Reaction(rxn_id, name=spec.get("name", ""))
        if "bounds" in spec:
            # Explicit bounds win over the reversibility implied by the arrow.
            rxn.bounds = tuple(spec["bounds"])
        else:
            config = cobra.Configuration()
            lower = config.lower_bound if reversible else 0.0
            rxn.bounds = (lower, config.upper_bound)
        if "subsystem" in spec:
            rxn.subsystem = spec["subsystem"]

        model.add_reactions([rxn])
        rxn.add_metabolites(coeffs)

        if rule:
            rxn.gene_reaction_rule = rule
            # Genes created here count as known for the remaining specs.
            known_genes.update(gene.id for gene in rxn.genes)

        added.append(rxn)

    return added
+ +Editing the same ``Reaction`` object changes only its stoichiometry — its id, name, +bounds, GPR, subsystem, and position are preserved automatically by cobra. + +So this port simply re-parses the equation (reusing the same metabolite +matching as :func:`~raven_python.manipulation.add.add_reactions_from_equations`, +including name and ``name[comp]`` modes that cobra lacks) and swaps the +metabolites in place. + +Like RAVEN, **bounds are left unchanged** even if the new equation's arrow +implies a different reversibility — use a bounds setter for that. +""" +from __future__ import annotations + +from collections.abc import Mapping + +import cobra +from cobra import Reaction + +from raven_python.manipulation.add import _stoichiometry + +__all__ = ["change_reaction_equations", "change_gene_reaction_rules"] + + +def change_reaction_equations( + model: cobra.Model, + equations: Mapping[str, str], + *, + mets_by: str = "id", + compartment: str | None = None, + allow_new_mets: bool = True, + new_met_prefix: str = "m", +) -> list[Reaction]: + """Replace the stoichiometry of existing reactions. + Parameters + ---------- + model + Target ``cobra.Model``, mutated in place. + equations + Mapping of ``reaction_id -> equation string``. Every ID must already + exist in the model. Equation syntax is identical to + :func:`~raven_python.manipulation.add.add_reactions_from_equations`. + mets_by, compartment, allow_new_mets, new_met_prefix + Metabolite-matching options, as in ``add_reactions_from_equations``. + + Returns + ------- + list of cobra.Reaction + The reactions changed, in input order. + + Notes + ----- + Bounds are **not** modified, matching RAVEN. Changing an equation from + ``-->`` to ``<=>`` does not by itself make the reaction reversible; adjust + the bounds separately. 
def change_gene_reaction_rules(
    model: cobra.Model,
    rules: Mapping[str, str],
    *,
    replace: bool = True,
) -> list[Reaction]:
    """Set or append gene-reaction rules on existing reactions.

    The new rule is assigned to ``reaction.gene_reaction_rule``; this function
    adds batching plus RAVEN's ``replace`` option to **append** rather than
    overwrite.

    Parameters
    ----------
    model
        Target ``cobra.Model``, mutated in place.
    rules
        Mapping of ``reaction_id -> GPR string``. Every ID must already exist.
    replace
        If True (default), overwrite the existing GPR. If False, append the new
        rule as an isozyme: ``(old) or (new)`` (just ``new`` if the reaction had
        no GPR). Appending an empty or whitespace-only rule leaves the existing
        GPR unchanged.

    Returns
    -------
    list of cobra.Reaction
        The reactions changed, in input order.

    Raises
    ------
    ValueError
        If any reaction ID in ``rules`` is not in the model. All IDs are checked
        before any reaction is modified, so a failed call leaves the model
        untouched.
    """
    # Validate every ID first so an unknown one cannot leave a half-applied batch.
    for rxn_id in rules:
        if rxn_id not in model.reactions:
            raise ValueError(f"Reaction {rxn_id!r} not found in the model.")

    changed: list[Reaction] = []
    for rxn_id, rule in rules.items():
        rxn = model.reactions.get_by_id(rxn_id)
        current = rxn.gene_reaction_rule

        if replace or not current:
            new_rule = rule
        elif not rule or not rule.strip():
            # Nothing to append; "(old) or ()" would be an invalid GPR.
            new_rule = current
        else:
            new_rule = f"({current}) or ({rule})"

        rxn.gene_reaction_rule = new_rule
        changed.append(rxn)

    return changed
Use cases: + + * Check whether the network can produce/consume a metabolite at all (compartment + topology is often what makes a model look blocked). + * Simplify a model for visualisation or an analysis that doesn't care about + compartments. + * As a pre-step for localisation when the user does want RAVEN's + "start from scratch" workflow (call :func:`merge_compartments` then + :func:`raven_python.localization.predict_localization` with the full reaction list). + + Metabolites that already share a base id (e.g. ``glc__D_c`` and ``glc__D_e`` both + map to ``glc__D``) collapse into one entity in the merged compartment; their + stoichiometric contributions are summed per reaction. Reactions that end up with + only one metabolite (e.g. ``A[c] → A[m]`` becomes ``A → A`` = nothing) are deleted + by default (RAVEN's ``deleteRxnsWithOneMet``). Reactions that become identical + after merging are deduplicated (one survives). + """ + out = model.copy() + + # 1. For each metabolite, derive a base id (strip the trailing _). + # Two mets in different compartments sharing the base id collapse to one. + new_to_old: dict[str, list[cobra.Metabolite]] = {} + for m in list(out.metabolites): + base = _base_id(m) + new_to_old.setdefault(base, []).append(m) + + # 2. Build the merged metabolites and rewrite reactions. + canonical: dict[str, cobra.Metabolite] = {} + for base, mets in new_to_old.items(): + proto = mets[0] + new_met = cobra.Metabolite(base, name=proto.name, compartment=merged_id, + formula=proto.formula, charge=proto.charge) + new_met.notes = dict(proto.notes or {}) + canonical[base] = new_met + + # Rewrite all reactions: replace each metabolite with its canonical, summing + # coefficients where multiple original mets collapse to one. 
def copy_to_compartment(
    model: cobra.Model,
    reactions: Iterable[str],
    target_compartment: str,
    *,
    target_compartment_name: str | None = None,
    delete_original: bool = False,
    id_suffix: str | None = None,
) -> tuple[cobra.Model, list[str], list[str]]:
    """Copy a set of reactions into ``target_compartment``. RAVEN's ``copyToComps``.

    Use cases:

    * Build a dual-localised pathway (e.g. duplicate glycolysis into a peroxisome).
    * Mirror a curated subsystem into an additional compartment as a draft to refine.
    * Set up the input for a flux comparison between alternate compartmentalisations.

    Each copied reaction gets the id ``"<reaction id>_<suffix>"`` (``suffix``
    defaults to ``target_compartment``); each metabolite it touches is mapped
    to, or created in, ``target_compartment`` via ``_met_in_compartment``.

    Parameters
    ----------
    model
        Source model. It is not modified; the work is done on ``model.copy()``.
    reactions
        IDs of the reactions to copy. A single ID given as a plain string is
        accepted and treated as a one-element list.
    target_compartment
        Compartment id to copy into. Registered on the copy if missing.
    target_compartment_name
        Display name used when the compartment has to be registered; defaults
        to the compartment id.
    delete_original
        If True, the source reaction is removed after its copy is added (a
        move instead of a copy). Orphaned metabolites are kept.
    id_suffix
        Suffix for new reaction/metabolite ids; defaults to ``target_compartment``.

    Returns
    -------
    (cobra.Model, list of str, list of str)
        ``(model_copy, new_reaction_ids, new_metabolite_ids)``.

    Raises
    ------
    ValueError
        If a requested reaction ID is not in the model.

    Notes
    -----
    When several metabolites of one reaction map onto the same target
    metabolite (e.g. the transport ``A_c -> A_m`` copied into ``p``), their
    coefficients are summed and zero net terms are dropped. If nothing remains,
    a warning is issued and the copy is added with empty stoichiometry.
    A reaction whose target id already exists is skipped (and is then not
    deleted either), which makes repeated calls idempotent.
    """
    import warnings  # local import: keeps this block independent of the file's import list

    out = model.copy()
    suffix = target_compartment if id_suffix is None else id_suffix
    if target_compartment not in out.compartments:
        out.compartments = {**out.compartments,
                            target_compartment: target_compartment_name or target_compartment}

    # A bare string would otherwise be iterated character by character.
    if isinstance(reactions, str):
        reactions = [reactions]

    preexisting_met_ids = {met.id for met in out.metabolites}
    new_rxn_ids: list[str] = []
    for rid in list(reactions):
        if rid not in out.reactions:
            raise ValueError(f"reaction {rid!r} not in model")
        src = out.reactions.get_by_id(rid)
        new_id = f"{rid}_{suffix}"
        if new_id in out.reactions:
            continue  # already copied; idempotent

        # Accumulate rather than assign: two source metabolites can collapse
        # onto the same target metabolite, and overwriting would keep only the
        # last coefficient (turning a transport into a free source/sink).
        new_stoich: dict[cobra.Metabolite, float] = {}
        for met, coeff in src.metabolites.items():
            target_met = _met_in_compartment(out, met, target_compartment, suffix=suffix)
            new_stoich[target_met] = new_stoich.get(target_met, 0.0) + coeff
        new_stoich = {met: coeff for met, coeff in new_stoich.items() if coeff != 0.0}
        if not new_stoich:
            warnings.warn(
                f"reaction {rid!r} has no net metabolites once copied to "
                f"compartment {target_compartment!r}; the copy {new_id!r} is "
                "added with empty stoichiometry.",
                stacklevel=2,
            )

        new_r = cobra.Reaction(new_id, name=src.name,
                               lower_bound=src.lower_bound, upper_bound=src.upper_bound)
        new_r.add_metabolites(new_stoich)
        new_r.gene_reaction_rule = src.gene_reaction_rule
        if src.subsystem:
            new_r.subsystem = src.subsystem
        new_r.notes = dict(src.notes or {})
        out.add_reactions([new_r])
        new_rxn_ids.append(new_id)
        if delete_original:
            out.remove_reactions([src.id], remove_orphans=False)

    new_met_ids = [met.id for met in out.metabolites if met.id not in preexisting_met_ids]
    return out, new_rxn_ids, new_met_ids
+ +MATLAB-COMPAT: GECKO MATLAB and RAVEN ``expandModel.m`` use string manipulation +on grRules to detect and split isozymes. raven_python uses cobrapy's GPR AST +instead. Output should be equivalent for any well-formed GPR; cases that differ +are likely malformed GPR strings that the AST flags as invalid. +""" +from __future__ import annotations + +import ast +import copy + +import cobra +from cobra.core.gene import GPR + + +def _gpr_to_dnf(gpr: GPR) -> list[list[str]]: + """Convert a GPR to disjunctive normal form (list of AND-clauses). + + An empty GPR yields an empty list. A single clause (no OR anywhere) + yields a list of length 1. OR-of-ANDs yields one sublist per + disjunct, each containing the gene names ANDed together. + + Handles distributivity: ``g1 and (g2 or g3)`` becomes + ``[[g1, g2], [g1, g3]]``. + """ + if gpr is None or gpr.body is None: + return [] + return _node_to_dnf(gpr.body) + + +def _node_to_dnf(node) -> list[list[str]]: + """Recursive helper. Returns DNF as list of AND-clauses.""" + if isinstance(node, ast.Name): + return [[node.id]] + if isinstance(node, ast.BoolOp): + if isinstance(node.op, ast.Or): + result: list[list[str]] = [] + for child in node.values: + result.extend(_node_to_dnf(child)) + return result + if isinstance(node.op, ast.And): + clauses: list[list[str]] = [[]] + for child in node.values: + child_dnf = _node_to_dnf(child) + new_clauses: list[list[str]] = [] + for existing in clauses: + for extra in child_dnf: + new_clauses.append(existing + extra) + clauses = new_clauses + return clauses + raise ValueError(f"Unexpected GPR node type: {type(node).__name__}") + + +def expand_model(model: cobra.Model) -> list[str]: + """Split reactions with isozymes (OR in GPR) into one reaction per isozyme. + For each reaction whose GPR contains at least one OR, the reaction + is removed and replaced by one copy per disjunctive clause. The new + reactions get ID suffix ``_EXP_1``, ``_EXP_2``, etc. 
All other + fields (stoichiometry, bounds, name, subsystem) are copied verbatim; + only the GPR is simplified to the single AND-clause for that + isozyme. + + Reactions with no GPR, or with a GPR that has no OR, are left + untouched. + + Parameters + ---------- + model + A cobra.Model, mutated in place. + + Returns + ------- + list of str + Sorted IDs of newly added expanded reactions (those with + ``_EXP_N`` suffixes). The original reactions that were split + are no longer in the model. + """ + expansions: list[tuple[cobra.Reaction, list[list[str]]]] = [] + + for rxn in model.reactions: + if not rxn.gene_reaction_rule: + continue + clauses = _gpr_to_dnf(rxn.gpr) + if len(clauses) <= 1: + continue + expansions.append((rxn, clauses)) + + added_ids: list[str] = [] + for original_rxn, clauses in expansions: + new_rxns: list[cobra.Reaction] = [] + for i, clause in enumerate(clauses, start=1): + new_rxn = cobra.Reaction( + id=f"{original_rxn.id}_EXP_{i}", + name=original_rxn.name, + ) + new_rxn.lower_bound = original_rxn.lower_bound + new_rxn.upper_bound = original_rxn.upper_bound + new_rxn.add_metabolites(dict(original_rxn.metabolites.items())) + new_rxn.subsystem = original_rxn.subsystem + new_rxn.gene_reaction_rule = " and ".join(clause) + # Propagate per-reaction metadata (notably ec-code / annotations) + # so downstream functions see the same annotations on expanded + # reactions as on the original. Deep-copy so siblings are independent. 
def convert_to_irreversible(model: cobra.Model) -> list[str]:
    """Split non-exchange reversible reactions into a forward + reverse pair.

    For each non-boundary reaction with ``lb < 0``:

    - The original reaction is kept as the forward direction with bounds
      ``(0, max(0, ub))``.
    - A new reaction with the same ID plus a ``_REV`` suffix is added for the
      reverse direction. Its stoichiometry is the negation of the original,
      its bounds are ``(max(0, -ub), -lb)``, and it inherits the name (with
      " (reversible)" appended) and the gene-reaction rule of the original.

    For the usual ``lb < 0 <= ub`` case this gives forward ``(0, ub)`` and
    reverse ``(0, -lb)``. For a reaction that can only run backwards
    (``lb <= ub < 0``) the forward direction is closed and the forced minimum
    reverse flux ``-ub`` is kept on the reverse reaction.

    Boundary reactions (``rxn.boundary``) are never split, regardless of their
    bounds.

    Parameters
    ----------
    model
        A cobra.Model, mutated in place.

    Returns
    -------
    list of str
        Sorted IDs of newly added reverse reactions (the ones ending in
        ``_REV``). The forward reactions retain their original IDs.

    Raises
    ------
    ValueError
        If a needed ``<id>_REV`` already exists in the model. This is checked
        before anything is modified.

    Notes
    -----
    Objective coefficients, subsystem, annotation and notes are not copied to
    the reverse reaction.
    """
    to_split = [
        rxn for rxn in model.reactions if not rxn.boundary and rxn.lower_bound < 0
    ]

    # Refuse to continue if a reverse ID is taken: the forward reaction would
    # be clamped while its reverse counterpart is not the one built here.
    clashes = sorted(
        f"{rxn.id}_REV" for rxn in to_split if f"{rxn.id}_REV" in model.reactions
    )
    if clashes:
        raise ValueError(
            f"Cannot create reverse reactions; IDs already exist in the model: {clashes}"
        )

    reverse_rxns_to_add: list[cobra.Reaction] = []
    forward_bounds: list[tuple[cobra.Reaction, tuple[float, float]]] = []

    for rxn in to_split:
        original_lb = rxn.lower_bound
        original_ub = rxn.upper_bound

        rev_rxn = cobra.Reaction(
            id=f"{rxn.id}_REV",
            name=(f"{rxn.name} (reversible)" if rxn.name else f"{rxn.id}_REV"),
        )
        # Set both bounds at once so lb <= ub holds at every step, whatever the
        # defaults of a fresh Reaction are.
        rev_rxn.bounds = (max(0.0, -original_ub), -original_lb)
        rev_rxn.add_metabolites({met: -coeff for met, coeff in rxn.metabolites.items()})
        rev_rxn.gene_reaction_rule = rxn.gene_reaction_rule

        reverse_rxns_to_add.append(rev_rxn)
        forward_bounds.append((rxn, (0.0, max(0.0, original_ub))))

    # Clamp the forward reactions only after every reverse copy has been built
    # from the unmodified bounds.
    for rxn, bounds in forward_bounds:
        rxn.bounds = bounds

    if reverse_rxns_to_add:
        model.add_reactions(reverse_rxns_to_add)

    return sorted(rxn.id for rxn in reverse_rxns_to_add)
def merge_models(
    models: Iterable[cobra.Model],
    *,
    match_by: str = "name",
    track_origin: bool = True,
) -> cobra.Model:
    """Merge models into a single new model.

    Parameters
    ----------
    models
        The models to merge (two or more). A single model is returned as a copy.
    match_by
        How metabolites are unified across models: ``"name"`` (default) treats
        metabolites with the same *name and compartment* as identical (IDs
        ignored); ``"id"`` matches by metabolite ID. In ``"name"`` mode a
        metabolite with an empty name is matched by ID instead, so unnamed
        metabolites are never lumped together.
    track_origin
        If True (default), record the source model's ``id`` (``"model"`` when
        the id is empty) in each reaction's, metabolite's, and gene's
        ``notes['origin']``.

    Returns
    -------
    cobra.Model
        A new merged model (``id="MERGED"``). Reactions are **not** de-duplicated
        — every reaction from every model is kept, with ID collisions renamed
        ``<id>_<origin>``.

    Raises
    ------
    ValueError
        If ``models`` is empty or ``match_by`` is not ``"name"`` or ``"id"``.

    Warns
    -----
    UserWarning
        When two metabolites that unify under the same key carry different
        formulas or charges; the first-seen metabolite is kept.
    """
    models = list(models)
    if not models:
        raise ValueError("merge_models requires at least one model.")
    if match_by not in ("name", "id"):
        raise ValueError(f"match_by must be 'name' or 'id', got {match_by!r}")
    if len(models) == 1:
        return models[0].copy()

    merged = Model("MERGED")
    comp_names: dict[str, str] = {}
    met_lookup: dict = {}  # name/comp or id key -> merged Metabolite

    def met_key(met: Metabolite):
        # An empty name must not be used as a matching key: every unnamed
        # metabolite in a compartment would share ("", comp) and collapse into
        # one. Name keys are tuples and ID keys are strings, so they cannot clash.
        if match_by == "name" and met.name:
            return (met.name, met.compartment)
        return met.id

    def ensure_metabolite(src: Metabolite, origin: str) -> Metabolite:
        key = met_key(src)
        if key in met_lookup:
            existing = met_lookup[key]
            # Two source models can map to the same key with different
            # formula/charge. The first-seen metabolite wins, but the conflict
            # is reported because it can break mass balance.
            if src.formula and existing.formula and src.formula != existing.formula:
                warnings.warn(
                    f"merge_models: metabolite {existing.id!r} (from earlier model) "
                    f"and {src.id!r} (from {origin!r}) share key {key!r} but "
                    f"have different formulas ({existing.formula!r} vs {src.formula!r}); "
                    "keeping the first.",
                    stacklevel=3,
                )
            if (
                existing.charge is not None
                and src.charge is not None
                and existing.charge != src.charge
            ):
                warnings.warn(
                    f"merge_models: metabolite {existing.id!r} (from earlier model) "
                    f"and {src.id!r} (from {origin!r}) share key {key!r} but "
                    f"have different charges ({existing.charge} vs {src.charge}); "
                    "keeping the first.",
                    stacklevel=3,
                )
            return existing
        new_id = _unique_id(merged.metabolites, src.id, origin)
        new_met = Metabolite(
            new_id, name=src.name, compartment=src.compartment,
            formula=src.formula, charge=src.charge,
        )
        new_met.annotation = copy.deepcopy(src.annotation)
        new_met.notes = copy.deepcopy(src.notes)
        if track_origin:
            new_met.notes.setdefault("origin", origin)
        merged.add_metabolites([new_met])
        met_lookup[key] = new_met
        return new_met

    for model in models:
        origin = model.id or "model"
        comp_names.update(model.compartments)
        # Snapshot so genes first introduced by this model can be tagged below.
        genes_before = {g.id for g in merged.genes}

        for rxn in model.reactions:
            new_id = _unique_id(merged.reactions, rxn.id, origin)
            new_rxn = Reaction(new_id, name=rxn.name)
            new_rxn.bounds = rxn.bounds
            new_rxn.subsystem = rxn.subsystem
            merged.add_reactions([new_rxn])
            new_rxn.add_metabolites(
                {ensure_metabolite(m, origin): coef for m, coef in rxn.metabolites.items()}
            )
            if rxn.gene_reaction_rule:
                new_rxn.gene_reaction_rule = rxn.gene_reaction_rule
            new_rxn.annotation = copy.deepcopy(rxn.annotation)
            new_rxn.notes = copy.deepcopy(rxn.notes)
            if track_origin:
                new_rxn.notes.setdefault("origin", origin)

        if track_origin:
            for gene in merged.genes:
                if gene.id not in genes_before:
                    gene.notes.setdefault("origin", origin)

    merged._compartments.update(comp_names)
    return merged
def _broadcast(value, n: int) -> list[float]:
    """Expand ``value`` into a list of ``n`` floats.

    A real scalar is repeated ``n`` times. Anything else is treated as an
    iterable, converted element-wise with ``float`` and required to contain
    exactly ``n`` items.

    Raises
    ------
    ValueError
        If an iterable ``value`` does not have exactly ``n`` elements.
    """
    # Function-scope import so the helper does not depend on the module's
    # top-level import block.
    from numbers import Real

    # ``numbers.Real`` instead of ``(int, float)``: NumPy scalars such as
    # ``np.int64`` / ``np.float32`` are registered as Real without subclassing
    # the builtins, so the narrower check pushed them into the iterable branch
    # where iterating a scalar raises TypeError.
    if isinstance(value, Real):
        return [float(value)] * n
    vals = [float(v) for v in value]
    if len(vals) != n:
        raise ValueError(
            f"Expected 1 or {n} values to match the reactions, got {len(vals)}."
        )
    return vals
def _as_list(obj) -> list:
    """Return ``obj`` as a list.

    A single string, ``Metabolite`` or ``Gene`` is wrapped in a one-element
    list; any other argument is materialised with ``list``.
    """
    is_single = isinstance(obj, (str, Metabolite, Gene))
    return [obj] if is_single else list(obj)
def remove_genes(
    model: cobra.Model,
    genes: str | Gene | Iterable,
    *,
    blocked_reactions: str = "remove",
    remove_orphans: bool = False,
) -> list[str]:
    """Delete genes from ``model`` and apply a policy to reactions left without a GPR.

    cobra deletes the genes and rewrites the gene-reaction rules. This wrapper
    then looks at every reaction that had a rule before the call and has an
    empty one afterwards, and treats it according to ``blocked_reactions``:

    * ``"remove"`` — delete the reaction (cobra's default).
    * ``"constrain"`` — keep the reaction but set its bounds to ``(0, 0)``.
    * ``"keep"`` — leave the reaction as it is, with an empty GPR.

    ``remove_orphans`` is forwarded to ``model.remove_reactions`` and is only
    used with ``blocked_reactions="remove"``.

    Returns
    -------
    list of str
        Sorted IDs of the reactions whose GPR went from non-empty to empty.
        Empty when none of the requested genes are in the model.

    Raises
    ------
    ValueError
        If ``blocked_reactions`` is not one of the three policies.
    """
    policies = ("remove", "constrain", "keep")
    if blocked_reactions not in policies:
        raise ValueError(
            f"blocked_reactions must be 'remove', 'constrain', or 'keep', "
            f"got {blocked_reactions!r}"
        )

    # Normalise each entry to an ID; IDs the model does not know are dropped.
    present = []
    for entry in _as_list(genes):
        gene_id = entry.id if isinstance(entry, Gene) else entry
        if gene_id in model.genes:
            present.append(gene_id)
    if not present:
        return []

    # Snapshot, before cobra edits anything, which touched reactions have a rule.
    touched_ids = set()
    for gene_id in present:
        touched_ids.update(r.id for r in model.genes.get_by_id(gene_id).reactions)
    with_rule = {
        rxn_id
        for rxn_id in touched_ids
        if model.reactions.get_by_id(rxn_id).gene_reaction_rule
    }

    # remove_reactions=False: the reaction policy is applied below instead.
    _cobra_remove_genes(model, present, remove_reactions=False)

    blocked = []
    for rxn_id in with_rule:
        if not model.reactions.get_by_id(rxn_id).gene_reaction_rule:
            blocked.append(rxn_id)

    if blocked_reactions == "constrain":
        for rxn_id in blocked:
            model.reactions.get_by_id(rxn_id).bounds = (0, 0)
    elif blocked_reactions == "remove":
        model.remove_reactions(blocked, remove_orphans=remove_orphans)

    return sorted(blocked)
def _can_produce_and_consume(met) -> tuple[bool, bool]:
    """Return ``(can_be_produced, can_be_consumed)`` for ``met``.

    Each reaction of ``met`` is inspected together with its bounds: a positive
    coefficient produces when forward flux is allowed (``upper_bound > 0``)
    and consumes when reverse flux is allowed (``lower_bound < 0``); a
    negative coefficient is the mirror image.
    """
    can_make = False
    can_use = False
    for reaction in met.reactions:
        stoich = reaction.get_coefficient(met)
        if stoich > 0:
            can_make |= reaction.upper_bound > 0
            can_use |= reaction.lower_bound < 0
            continue
        if stoich < 0:
            can_use |= reaction.upper_bound > 0
            can_make |= reaction.lower_bound < 0
    return can_make, can_use
def remove_duplicate_reactions(
    model: cobra.Model, *, reserved: Iterable[str] | None = None
) -> list[str]:
    """Remove redundant copies from each set of duplicate reactions.

    Reactions are duplicates when ``_signature`` is equal for them, i.e. they
    have identical stoichiometry, bounds, and objective coefficient.

    Parameters
    ----------
    model
        Model to reduce (mutated in place).
    reserved
        IDs of reactions that must never be removed.

    Returns
    -------
    list of str
        IDs of the removed reactions.

    Notes
    -----
    If a duplicate group contains reserved reactions, those are the survivors
    and every non-reserved copy is removed. Otherwise the last reaction of the
    group (in model order) is kept.
    """
    reserved = set(reserved or [])
    groups: dict = {}
    for rxn in model.reactions:
        groups.setdefault(_signature(rxn), []).append(rxn)

    removed: list[str] = []
    for rxns in groups.values():
        if len(rxns) <= 1:
            continue
        if any(r.id in reserved for r in rxns):
            # A reserved reaction already represents the group. Keeping an
            # additional non-reserved copy would leave the duplicate in place.
            to_remove = [r for r in rxns if r.id not in reserved]
        else:
            to_remove = rxns[:-1]
        if to_remove:
            removed += [r.id for r in to_remove]
            model.remove_reactions(to_remove)
    return removed
+ """ + revs = [r for r in model.reactions if r.lower_bound < 0 < r.upper_bound] + if not revs: + return [] + # Infeasible models surface as either OptimizationError (Gurobi/HiGHS) or + # NaN-filled ranges (some optlang backends silently). Catch both and raise + # a single clear error — the original ``abs(NaN) < eps`` comparison would + # have silently no-op'd, letting bogus "all reactions truly reversible" + # decisions sneak through. + try: + fva = flux_variability_analysis( + model, reaction_list=revs, fraction_of_optimum=0.0 + ) + except Exception as exc: # noqa: BLE001 - solver-family agnostic + raise RuntimeError( + "constrain_reversible_reactions: FVA failed — the model is likely " + "infeasible at fraction_of_optimum=0. Fix the infeasibility first " + "(often a missing exchange or an over-constrained essential). " + f"({exc})" + ) from exc + if fva[["minimum", "maximum"]].isna().any().any(): + raise RuntimeError( + "constrain_reversible_reactions: FVA returned NaN ranges — the " + "model is infeasible at fraction_of_optimum=0. Fix the infeasibility " + "first (often a missing exchange or an over-constrained essential)." + ) + + changed: list[str] = [] + for rxn in revs: + lo = fva.at[rxn.id, "minimum"] + hi = fva.at[rxn.id, "maximum"] + # Guard against ±inf ranges (unbounded objective): treat them as truly + # reversible rather than "zero" by the abs(·) < eps check. 
def group_linear_reactions(
    model: cobra.Model, *, reserved: Iterable[str] | None = None
) -> None:
    """Merge linear (single-producer, single-consumer) reaction chains.

    **Lossy**: gene-reaction associations are discarded (RAVEN does the same),
    since merged reactions have no meaningful combined GPR. The model is first
    made irreversible; then any metabolite that takes part in exactly two
    reactions, one with a positive and one with a negative coefficient, is
    eliminated by folding the second reaction into the first.

    Parameters
    ----------
    model
        Model to reduce. Mutated in place.
    reserved
        IDs of reactions that must not take part in a merge.
    """
    reserved = set(reserved or [])

    # Lossy: drop all gene information.
    for rxn in model.reactions:
        rxn.gene_reaction_rule = ""
    for gene in list(model.genes):
        model.genes.remove(gene)

    convert_to_irreversible(model)

    # Worklist of metabolites to (re)consider. A merge can turn neighbouring
    # metabolites into new linear links, so the metabolites it touched are
    # re-enqueued instead of rescanning the whole model.
    pending: list = list(model.metabolites)
    while pending:
        met = pending.pop()
        if met not in model.metabolites:  # removed in a previous merge
            continue
        rxns = list(met.reactions)
        if len(rxns) != 2 or any(r.id in reserved for r in rxns):
            continue
        r1, r2 = rxns
        c1, c2 = r1.get_coefficient(met), r2.get_coefficient(met)
        if (c1 > 0) == (c2 > 0):  # need one producer and one consumer
            continue
        # Steady state on ``met`` forces v2 = ratio * v1, so r2's bounds and
        # objective are rescaled onto r1's flux.
        ratio = abs(c1 / c2)
        new_lb = max(r1.lower_bound, r2.lower_bound / ratio)
        new_ub = min(r1.upper_bound, r2.upper_bound / ratio)
        new_obj = r1.objective_coefficient + r2.objective_coefficient * ratio
        touched = set(r1.metabolites) | set(r2.metabolites)
        # Merge r2*ratio into r1. The shared metabolite is cancelled with
        # exactly ``-c1`` rather than ``c2 * ratio``: the latter can differ
        # from ``-c1`` by a rounding error for non-integer stoichiometry,
        # which would leave a tiny spurious coefficient behind.
        delta = {m: c * ratio for m, c in r2.metabolites.items() if m.id != met.id}
        delta[met] = -c1
        r1.add_metabolites(delta)
        model.remove_reactions([r2])
        r1.bounds = (new_lb, new_ub)
        r1.objective_coefficient = new_obj
        pending.extend(m for m in touched if m in model.metabolites)
    # One terminal cleanup pass: reactions emptied by merging, then orphans.
    empty = [r for r in model.reactions if not r.metabolites]
    if empty:
        model.remove_reactions(empty)
    _prune_orphan_metabolites(model)
def add_reactions_from_model(
    model: cobra.Model,
    source_model: cobra.Model,
    reactions: str | Iterable[str],
    *,
    genes: bool | str | Iterable[str] = False,
    note: str | None = "Added via add_reactions_from_model()",
    confidence: int | None = None,
) -> list[Reaction]:
    """Copy reactions from ``source_model`` into ``model``.

    Metabolites are matched by ``name[compartment]``: a source metabolite whose
    name and compartment already exist in ``model`` is reused, and only the
    remaining ones are created (copying formula, charge, annotation and notes).

    Parameters
    ----------
    model
        Draft model to copy into (mutated in place).
    source_model
        Model to copy reactions from.
    reactions
        Reaction ID(s) in ``source_model``. Repeated IDs are collapsed to their
        first occurrence, and reactions already present in ``model`` (by ID)
        are skipped.
    genes
        ``False`` (default): add reactions without GPRs. ``True``: copy each
        reaction's GPR from the source. A string: use it as the GPR for every
        added reaction. A list: per-reaction GPRs (matching the reactions that
        are actually added). New genes are created automatically.
    note
        Stored in each added reaction's ``notes['note']`` (set ``None`` to skip).
    confidence
        If given, stored in each added reaction's ``notes['confidence_score']``.

    Returns
    -------
    list of cobra.Reaction
        The reactions added, in input order.

    Raises
    ------
    ValueError
        If a requested reaction is missing from ``source_model``, if every
        requested reaction is already in ``model``, or if a ``genes`` list does
        not contain one rule per reaction being added.
    """
    rxn_ids = [reactions] if isinstance(reactions, str) else list(reactions)
    # Collapse repeated IDs (first occurrence wins). cobra's ``add_reactions``
    # ignores a reaction whose ID is already in the model, so a second copy
    # would stay detached from the model while still being linked to the
    # model's metabolites and returned to the caller.
    rxn_ids = list(dict.fromkeys(rxn_ids))
    missing = [r for r in rxn_ids if r not in source_model.reactions]
    if missing:
        raise ValueError(f"Reactions not found in the source model: {missing}")

    new_ids = [r for r in rxn_ids if r not in model.reactions]
    if not new_ids:
        raise ValueError("All reactions are already in the model.")
    source_rxns = [source_model.reactions.get_by_id(r) for r in new_ids]

    if genes is False:
        rules = [""] * len(source_rxns)
    elif genes is True:
        rules = [r.gene_reaction_rule for r in source_rxns]
    elif isinstance(genes, str):
        rules = [genes] * len(source_rxns)
    else:
        rules = list(genes)
        if len(rules) != len(source_rxns):
            raise ValueError(
                f"genes list has {len(rules)} rules but {len(source_rxns)} "
                "reactions are being added."
            )

    # Match metabolites by name[comp]; create only the genuinely new ones.
    draft_by_name = {_name_comp(m): m for m in model.metabolites}
    new_mets: list[Metabolite] = []
    pending: set[str] = set()
    # Track ids minted within this batch so two source mets that share an id
    # but differ in name[comp] don't collide when add_metabolites runs.
    pending_ids: set[str] = set()
    for srx in source_rxns:
        for met in srx.metabolites:
            key = _name_comp(met)
            if key in draft_by_name or key in pending:
                continue
            pending.add(key)
            if met.id not in model.metabolites and met.id not in pending_ids:
                new_id = met.id
            else:
                # _new_met_id only knows the model; loop past in-batch hits too.
                new_id = _new_met_id(model, "m")
                while new_id in pending_ids:
                    n = int(new_id[1:]) + 1
                    new_id = f"m{n}"
                    while new_id in model.metabolites:
                        n += 1
                        new_id = f"m{n}"
            pending_ids.add(new_id)
            new_met = Metabolite(
                new_id,
                name=met.name,
                compartment=met.compartment,
                formula=met.formula,
                charge=met.charge,
            )
            new_met.annotation = copy.deepcopy(met.annotation)
            new_met.notes = copy.deepcopy(met.notes)
            new_mets.append(new_met)
            draft_by_name[key] = new_met
    if new_mets:
        model.add_metabolites(new_mets)

    added: list[Reaction] = []
    for srx, rule in zip(source_rxns, rules, strict=True):
        rxn = Reaction(srx.id, name=srx.name)
        rxn.bounds = srx.bounds
        rxn.subsystem = srx.subsystem
        model.add_reactions([rxn])
        rxn.add_metabolites(
            {draft_by_name[_name_comp(met)]: coef for met, coef in srx.metabolites.items()}
        )
        if rule:
            rxn.gene_reaction_rule = rule
        rxn.annotation = copy.deepcopy(srx.annotation)
        notes = copy.deepcopy(srx.notes)
        if note is not None:
            notes["note"] = note
        if confidence is not None:
            notes["confidence_score"] = confidence
        rxn.notes = notes
        added.append(rxn)

    return added
def add_transport_reactions(
    model: cobra.Model,
    from_compartment: str,
    to_compartments: str | Iterable[str],
    metabolite_names: str | Iterable[str] | None = None,
    *,
    reversible: bool = True,
    only_to_existing: bool = True,
    id_prefix: str = "tr_",
) -> list[Reaction]:
    """Add transport reactions from one compartment to one or more others.

    For every requested metabolite name and every target compartment, a
    reaction ``-1 source / +1 target`` is added between the same-named
    metabolites of the two compartments.

    Parameters
    ----------
    model
        Model to extend (mutated in place).
    from_compartment
        Source compartment id.
    to_compartments
        Target compartment id(s). Repeated ids are used once. Must not contain
        ``from_compartment``.
    metabolite_names
        Names of metabolites to transport; repeated names are used once.
        Default: every metabolite in ``from_compartment``.
    reversible
        If True (default), bounds span the cobra configuration default
        (reversible); otherwise lower bound 0.
    only_to_existing
        If True (default), only transport a metabolite into a target
        compartment where a same-named metabolite already exists. If False,
        create the missing target metabolite (copying name/formula/charge/
        annotation from the source) before adding the transport.
    id_prefix
        Prefix for the sequential reaction IDs (``tr_0001``, ...).

    Returns
    -------
    list of cobra.Reaction
        The transport reactions added, in creation order.

    Raises
    ------
    ValueError
        If a compartment is not in the model, if a target equals the source
        compartment, or if a requested name is absent from the source
        compartment.
    """
    # Function-scope import: this module does not import ``copy`` at top level.
    import copy

    # cobra's `model.compartments` only lists compartments that have metabolites;
    # include registered-but-empty ones so transport can target an empty compartment.
    known = set(model.compartments) | set(model._compartments)
    if from_compartment not in known:
        raise ValueError(f"Compartment {from_compartment!r} is not in the model.")
    if isinstance(to_compartments, str):
        to_compartments = [to_compartments]
    else:
        # A repeated target would add the same transport reactions twice.
        to_compartments = list(dict.fromkeys(to_compartments))
    for comp in to_compartments:
        if comp not in known:
            raise ValueError(f"Compartment {comp!r} is not in the model.")
        if comp == from_compartment:
            # Source and target would be the same metabolite, so
            # ``{src: -1, dst: 1}`` collapses to a single +1 entry: a reaction
            # creating the metabolite from nothing, not a transport.
            raise ValueError(
                f"Cannot add transport from compartment {from_compartment!r} "
                "to itself."
            )

    source = _index_by_name(
        (m for m in model.metabolites if m.compartment == from_compartment),
        from_compartment,
    )
    if metabolite_names is None:
        names = list(source)
    else:
        names = [metabolite_names] if isinstance(metabolite_names, str) else list(metabolite_names)
        # A repeated name would add the same transport reaction twice.
        names = list(dict.fromkeys(names))
        missing = [n for n in names if n not in source]
        if missing:
            raise ValueError(
                f"Metabolites not found in compartment {from_compartment!r}: {missing}"
            )

    cfg = cobra.Configuration()
    bounds = (cfg.lower_bound, cfg.upper_bound) if reversible else (0.0, cfg.upper_bound)
    # Fall back to the compartment id when it has no (non-empty) display name.
    from_name = model.compartments.get(from_compartment) or from_compartment
    next_id = _transport_id_factory(model, id_prefix)

    added: list[Reaction] = []
    for to_comp in to_compartments:
        to_name = model.compartments.get(to_comp) or to_comp
        targets = _index_by_name(
            (m for m in model.metabolites if m.compartment == to_comp),
            to_comp,
        )
        for name in names:
            src = source[name]
            dst = targets.get(name)
            if dst is None:
                if only_to_existing:
                    continue
                dst = Metabolite(
                    _new_met_id(model, "m"),
                    name=name,
                    compartment=to_comp,
                    formula=src.formula,
                    charge=src.charge,
                )
                # Deep copy: a shallow ``dict(...)`` would share nested values
                # between source and target, so editing one metabolite's
                # annotation would silently change the other's.
                dst.annotation = copy.deepcopy(src.annotation)
                model.add_metabolites([dst])
                targets[name] = dst

            rxn = Reaction(next_id())
            rxn.name = f"{name} transport, {from_name}-{to_name}"
            rxn.bounds = bounds
            model.add_reactions([rxn])
            rxn.add_metabolites({src: -1, dst: 1})
            added.append(rxn)

    return added
Both are returned as tidy +:class:`pandas.DataFrame`\\ s; the scoring adapters delegate the GPR walk to +:func:`raven_python.init.score.score_reactions_from_genes` so there is one source of truth +for reaction scoring. + +Pipeline (typical (f)tINIT entry): + +.. code-block:: python + + hpa = parse_hpa("normal_tissue.tsv") + gene_scores = hpa_gene_scores(hpa, tissue="liver", celltype="hepatocytes") + rxn_scores = score_reactions_from_genes(model, gene_scores) + # → ftinit(prep, rxn_scores, gene_scores=gene_scores, ...) + +or for RNA-seq: + +.. code-block:: python + + rna = parse_hpa_rna("rna_tissue_consensus.tsv") + gene_scores = rna_gene_scores(rna, tissue="liver") # ref = per-gene cross-tissue mean + rxn_scores = score_reactions_from_genes(model, gene_scores) +""" +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass +from pathlib import Path + +import pandas as pd + +from raven_python.init.score import gene_scores_from_expression + +# RAVEN's hpaLevelScores defaults (scoreModel.m). HPA reports either antibody-staining +# levels (Strong/Moderate/Weak/Negative) or "APE" classes (High/Medium/Low/Not detected / +# Ascending/Descending/...); the four common categories are mapped here. Unknown levels +# (e.g. "Mixed", "N/A") fall through to NaN and are dropped during scoring. +HPA_LEVEL_SCORES: dict[str, float] = { + "High": 20.0, "Medium": 15.0, "Low": 10.0, "Not detected": -8.0, + "Strong": 20.0, "Moderate": 15.0, "Weak": 10.0, "Negative": -8.0, +} + +_HPA_HEADERS = ("Gene", "Gene name", "Tissue", "Cell type", "Level", "Reliability") +_HPA_RNA_HEADERS = ("Gene", "Gene name", "Tissue") # extra TPM columns follow + + +@dataclass +class HPAData: + """Tidy HPA proteomics data: one row per (gene, tissue, cell type). + + :attr:`df` columns: ``gene_id``, ``gene_name``, ``tissue``, ``celltype``, ``level``, + ``reliability``. 
def parse_hpa_rna(path: str | Path) -> HPARnaData:
    """Parse an HPA RNA-seq dump.

    Two layouts are accepted:

    * Tidy (one row per gene × tissue): ``Gene  Gene name  Tissue  <value>``,
      where the value column is ``TPM`` or, failing that, ``nTPM`` or ``pTPM``
      (the first of these that is present is used).
    * Wide (no ``Tissue`` column): ``Gene  Gene name  TissueA  TissueB ...``
      with one value column per tissue; it is melted into the tidy shape.

    Rows whose value cannot be parsed as a number are dropped.

    Returns
    -------
    HPARnaData
        Wraps a frame with columns ``gene_id``, ``gene_name``, ``tissue``, ``tpm``.

    Raises
    ------
    ValueError
        If the ``Gene`` / ``Gene name`` columns are missing, or if a tidy file
        has none of the recognised value columns.
    """
    # Preference order for the tidy value column; ``TPM`` first so files that
    # carry several of them keep resolving to the same column as before.
    value_columns = ("TPM", "nTPM", "pTPM")

    df = pd.read_csv(path, sep=None, engine="python", dtype=str, na_filter=False)
    columns = set(df.columns)
    if {"Gene", "Gene name", "Tissue"}.issubset(columns):
        value_col = next((c for c in value_columns if c in columns), None)
        if value_col is None:
            # Never fall through to the wide-layout melt here: that would turn
            # the ``Tissue`` column and any value column into bogus tissues.
            raise ValueError(
                f"{path}: tidy HPA RNA file has none of the value columns "
                f"{list(value_columns)} (got {list(df.columns)})"
            )
        df = df.rename(columns={"Gene": "gene_id", "Gene name": "gene_name",
                                "Tissue": "tissue", value_col: "tpm"})
        df = df[["gene_id", "gene_name", "tissue", "tpm"]]
    elif {"Gene", "Gene name"}.issubset(columns):
        # Wide layout: tissues are extra columns to melt.
        df = df.melt(id_vars=["Gene", "Gene name"], var_name="tissue", value_name="tpm")
        df = df.rename(columns={"Gene": "gene_id", "Gene name": "gene_name"})
    else:
        raise ValueError(f"{path}: expected Gene/Gene name/Tissue/TPM columns "
                         f"(got {list(df.columns)})")
    # Everything was read as str; non-numeric cells become NaN and are dropped.
    df["tpm"] = pd.to_numeric(df["tpm"], errors="coerce")
    df = df.dropna(subset=["tpm"]).reset_index(drop=True)
    return HPARnaData(df)
+ """ + if multiple_celltype not in ("best", "average"): + raise ValueError(f"multiple_celltype must be 'best' or 'average'; got {multiple_celltype!r}") + scores_table = dict(level_scores) if level_scores is not None else HPA_LEVEL_SCORES + + sub = hpa.df.loc[hpa.df["tissue"] == tissue].copy() + if celltype is not None: + sub = sub.loc[sub["celltype"] == celltype] + sub["score"] = sub["level"].map(scores_table) + sub = sub.dropna(subset=["score"]) # unknown HPA levels drop out (omitted, not -inf) + if sub.empty: + return {} + agg = {"best": "max", "average": "mean"}[multiple_celltype] + return sub.groupby("gene_id")["score"].agg(agg).to_dict() + + +def rna_gene_scores( + rna: HPARnaData, + tissue: str, + *, + reference: Mapping[str, float] | float | None = None, + factor: float = 5.0, + max_score: float = 10.0, + min_score: float = -5.0, +) -> dict[str, float]: + """Numeric gene scores from HPA RNA-seq TPM for one ``tissue``. + + Thin wrapper over :func:`raven_python.init.score.gene_scores_from_expression` (the same + ``5·ln(TPM/reference)``-clamped scoring used elsewhere): selects the tissue, derives + a reference if none is given (per-gene mean TPM across all tissues — RAVEN's default + for ``arrayData.threshold``), and returns ``{gene_id: score}``. 
+ """ + if tissue not in set(rna.df["tissue"]): + raise ValueError(f"tissue {tissue!r} not in dataset (tissues: {rna.tissues()})") + if reference is None: + reference = rna.df.groupby("gene_id")["tpm"].mean().to_dict() + return gene_scores_from_expression(rna.expression(tissue), reference, + factor=factor, max_score=max_score, min_score=min_score) + + +def _check_headers(df: pd.DataFrame, expected: tuple[str, ...], path: str | Path) -> None: + missing = [h for h in expected if h not in df.columns] + if missing: + raise ValueError(f"{path}: missing HPA columns {missing} (got {list(df.columns)})") diff --git a/src/raven_python/reconstruction/__init__.py b/src/raven_python/reconstruction/__init__.py new file mode 100644 index 0000000..a270e2c --- /dev/null +++ b/src/raven_python/reconstruction/__init__.py @@ -0,0 +1 @@ +"""De novo reconstruction from KEGG and protein homology (BLAST/DIAMOND).""" diff --git a/src/raven_python/reconstruction/homology/__init__.py b/src/raven_python/reconstruction/homology/__init__.py new file mode 100644 index 0000000..6ed9748 --- /dev/null +++ b/src/raven_python/reconstruction/homology/__init__.py @@ -0,0 +1,19 @@ +"""Homology-based reconstruction from template models (getModelFromHomology, BLAST/DIAMOND).""" +from raven_python.reconstruction.homology.blast import ( + blast_from_table, + run_blast, + run_diamond, +) +from raven_python.reconstruction.homology.hits import HIT_COLUMNS, make_ortholog_hits, validate_hits +from raven_python.reconstruction.homology.homology import HomologyResult, get_model_from_homology + +__all__ = [ + "HIT_COLUMNS", + "HomologyResult", + "blast_from_table", + "get_model_from_homology", + "make_ortholog_hits", + "run_blast", + "run_diamond", + "validate_hits", +] diff --git a/src/raven_python/reconstruction/homology/blast.py b/src/raven_python/reconstruction/homology/blast.py new file mode 100644 index 0000000..246ddab --- /dev/null +++ b/src/raven_python/reconstruction/homology/blast.py @@ -0,0 +1,146 @@ 
def run_blast(
    organism_id: str,
    fasta: str | Path,
    model_ids: Sequence[str],
    ref_fastas: Sequence[str | Path],
    *,
    evalue: float = 1e-5,
    threads: int = 1,
    blastp: str | Path | None = None,
    makeblastdb: str | Path | None = None,
) -> pd.DataFrame:
    """Bidirectional BLAST+ between an organism and template organisms.

    For every ``(model_id, ref_fasta)`` pair two searches are run: template
    proteins against the organism, and organism proteins against the template.
    Requires BLAST+ (``blastp``, ``makeblastdb``).

    Parameters
    ----------
    organism_id
        Id recorded for the organism side of each hit.
    fasta
        Protein FASTA of the organism.
    model_ids, ref_fastas
        Template ids and their protein FASTA files, pairwise. A single id or a
        single path is accepted in place of a one-element sequence.
    evalue
        Passed to ``blastp -evalue``.
    threads
        Passed to ``blastp -num_threads``.
    blastp, makeblastdb
        Optional explicit binaries, handed to ``resolve_binary``.

    Returns
    -------
    pandas.DataFrame
        The concatenated hits table with the ``HIT_COLUMNS`` columns (empty
        when there are no templates).

    Raises
    ------
    ValueError
        If ``model_ids`` and ``ref_fastas`` differ in length.
    RuntimeError
        Propagated from ``_run`` when a BLAST+ command exits non-zero.
    """
    # Both arguments go through _as_list so a bare string id is one id, not a
    # sequence of characters (``list("abc")`` would yield ['a', 'b', 'c']).
    model_ids = _as_list(model_ids)
    ref_fastas = _as_list(ref_fastas)
    if len(model_ids) != len(ref_fastas):
        raise ValueError("model_ids and ref_fastas must have the same length.")
    blastp = resolve_binary("blastp", binary=blastp)
    makeblastdb = resolve_binary("makeblastdb", binary=makeblastdb)
    outfmt = "10 " + " ".join(_OUTFMT_FIELDS)  # 10 = CSV

    frames = []
    with tempfile.TemporaryDirectory() as tmp:
        tmp = Path(tmp)

        def blastp_dir(query, subject_fasta, from_id, to_id, tag):
            # The database name comes from a positional tag, never from the
            # ids: ids may contain spaces or path separators, which are not
            # safe inside a BLAST database path.
            db = tmp / f"db_{tag}"
            _run([makeblastdb, "-in", str(subject_fasta), "-dbtype", "prot", "-out", str(db)])
            out = _run([
                blastp, "-query", str(query), "-db", str(db), "-evalue", str(evalue),
                "-outfmt", outfmt, "-num_threads", str(threads),
            ])
            return _parse_tabular(out, from_id, to_id, sep=",")

        for idx, (model_id, ref) in enumerate(zip(model_ids, ref_fastas, strict=True)):
            # template -> organism, and organism -> template
            frames.append(blastp_dir(ref, fasta, model_id, organism_id, f"{idx}_fwd"))
            frames.append(blastp_dir(fasta, ref, organism_id, model_id, f"{idx}_rev"))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=HIT_COLUMNS)
def blast_from_table(source: str | Path | pd.DataFrame) -> pd.DataFrame:
    """Load a precomputed homology hits table (CSV path or DataFrame).

    The input is a plain CSV file or an in-memory DataFrame, not an Excel
    workbook. It must contain the ``HIT_COLUMNS`` columns; any extra columns
    are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to ``HIT_COLUMNS`` whose ``from_gene`` and
        ``to_gene`` values are strings (missing values are left as they are).
        A DataFrame passed as ``source`` is not modified.

    Raises
    ------
    ValueError
        Propagated from ``validate_hits`` when a required column is missing.
    """
    # Force gene-id columns to str: an all-numeric gene-id column (e.g. Entrez ids)
    # would otherwise be read as int64 and never match the string gene ids in a model.
    if isinstance(source, pd.DataFrame):
        df = source
    else:
        df = pd.read_csv(source, dtype={"from_gene": str, "to_gene": str})
    validate_hits(df)
    hits = df[HIT_COLUMNS].copy()
    # Apply the same coercion to DataFrame input, which previously skipped it.
    # Values already parsed as floats keep their float spelling (e.g. "5.0").
    for column in ("from_gene", "to_gene"):
        hits[column] = hits[column].map(lambda value: value if pd.isna(value) else str(value))
    return hits
def make_ortholog_hits(
    ortholog_pairs: Iterable[tuple[str, str]],
    source_model_id: str,
    target_id: str,
) -> pd.DataFrame:
    """Turn a predefined ortholog list into a bidirectional hits table.

    Every ``(source_gene, target_gene)`` pair produces two rows, source to
    target followed by target to source, both carrying the same sentinel
    metrics (evalue 0, identity 100, align_len 1000, bitscore 1000, ppos 100)
    so that each pair passes any reasonable filter. This lets a known ortholog
    mapping feed :func:`get_model_from_homology` without a BLAST run, and is
    also the testing entry point.

    Parameters
    ----------
    ortholog_pairs
        Iterable of ``(source_gene, target_gene)``; source is the
        template/model organism, target the organism being built. Both
        members are converted with ``str``.
    source_model_id
        ID of the template model the source genes belong to.
    target_id
        ID of the organism to build a model for (``model_for``).

    Raises
    ------
    ValueError
        If ``ortholog_pairs`` yields no pairs.
    """
    pairs = [(str(source), str(target)) for source, target in ortholog_pairs]
    if len(pairs) == 0:
        raise ValueError("ortholog_pairs is empty.")

    # evalue, identity, align_len, bitscore, ppos: shared by every emitted row.
    sentinel = (0.0, 100.0, 1000, 1000.0, 100.0)
    rows = []
    for source_gene, target_gene in pairs:
        forward = (source_model_id, target_id, source_gene, target_gene, *sentinel)
        backward = (target_id, source_model_id, target_gene, source_gene, *sentinel)
        rows.extend((forward, backward))
    return pd.DataFrame(rows, columns=HIT_COLUMNS)
+ gene_map + ``{model_id: {template_gene: [new_gene, ...]}}`` ortholog mapping used. + """ + + model: cobra.Model + gene_map: dict = field(default_factory=dict) + + +class _Unmapped: + """A GPR leaf gene with no ortholog in the new organism.""" + + __slots__ = ("gene",) + + def __init__(self, gene: str): + self.gene = gene + + +def _rewrite_node(node, ortho: dict, policy: str, model_id: str): + """Rewrite a GPR AST node, substituting template genes by their orthologs. + + Returns a GPR sub-expression string, ``None`` (nothing survives), or an + ``_Unmapped`` for a bare unmapped leaf (the parent decides what to do). + """ + if isinstance(node, ast.Name): + new_genes = ortho.get(node.id) + if new_genes: + return new_genes[0] if len(new_genes) == 1 else "(" + " or ".join(new_genes) + ")" + return _Unmapped(node.id) + + if isinstance(node, ast.BoolOp): + children = [_rewrite_node(c, ortho, policy, model_id) for c in node.values] + if isinstance(node.op, ast.Or): + # An isozyme branch with no ortholog is simply absent. + parts = [c for c in children if isinstance(c, str)] + if not parts: + return None + return parts[0] if len(parts) == 1 else "(" + " or ".join(parts) + ")" + # And: apply the complex policy to unmapped subunits. 
+ parts = [] + for child in children: + if isinstance(child, str): + parts.append(child) + elif isinstance(child, _Unmapped): + if policy == "flag": + parts.append(f"OLD_{model_id}_{child.gene}") + elif policy == "drop": + return None # incomplete complex -> reaction unsupported + # policy == "keep": drop the unmapped subunit + else: # None (a dead sub-branch) + if policy == "drop": + return None + if not parts: + return None + return parts[0] if len(parts) == 1 else "(" + " and ".join(parts) + ")" + + return None + + +def _rewrite_gpr(rxn, ortho: dict, policy: str, model_id: str): + """Return the rewritten GPR string, or None if the reaction is unsupported.""" + if not rxn.gene_reaction_rule: + return None + # A reaction is only transferred if at least one of its genes has an ortholog. + if not any(g.id in ortho for g in rxn.genes): + return None + result = _rewrite_node(rxn.gpr.body, ortho, policy, model_id) + if isinstance(result, str): + return result + return None + + +def _strictness_to_params(strictness, bidirectional, best_hits_only, complex_policy, map_direction): + """Map RAVEN's strictness 1/2/3 onto the clearer parameters (compat).""" + if strictness is None: + return bidirectional, best_hits_only, complex_policy, map_direction + if strictness == 1: + return True, False, complex_policy, map_direction + if strictness == 2: + return False, False, complex_policy, map_direction + if strictness == 3: + return True, True, complex_policy, map_direction + raise ValueError(f"strictness must be 1, 2 or 3, got {strictness}") + + +def _ortholog_map( + hits, model_for, model_ids, *, bidirectional, best_hits_only, score, map_direction, + model_genes, max_evalue, min_align_len, min_identity, +): + """Build {model_id: {template_gene: [new_gene, ...]}} from the hits table.""" + h = hits[ + (hits.evalue <= max_evalue) + & (hits.align_len >= min_align_len) + & (hits.identity >= min_identity) + ] + + if best_hits_only: + ascending = score == "evalue" + h = 
h.sort_values(score, ascending=ascending) + h = h.groupby(["from_id", "to_id", "from_gene"], sort=False).head(1) + + # Directional views, normalised to (model_id, new_gene, template_gene). + fwd = ( + h[h.from_id == model_for][["to_id", "from_gene", "to_gene"]] + .rename(columns={"to_id": "model_id", "from_gene": "new_gene", "to_gene": "template_gene"}) + ) + rev = ( + h[h.to_id == model_for][["from_id", "from_gene", "to_gene"]] + .rename(columns={"from_id": "model_id", "from_gene": "template_gene", "to_gene": "new_gene"}) + ) + fwd = fwd[fwd.model_id.isin(model_ids)] + rev = rev[rev.model_id.isin(model_ids)] + + if bidirectional: + pairs = fwd.merge(rev, on=["model_id", "new_gene", "template_gene"], how="inner") + elif map_direction == "new_to_old": + pairs = fwd + else: + pairs = rev + pairs = pairs[["model_id", "new_gene", "template_gene"]].drop_duplicates() + if pairs.empty: + return {} + + # Keep only template genes that actually exist in their model. + pairs = pairs[pairs.apply(lambda r: r.template_gene in model_genes.get(r.model_id, ()), axis=1)] + + ortho: dict = {} + for model_id, template_gene, new_gene in zip(pairs.model_id, pairs.template_gene, pairs.new_gene, strict=True): + ortho.setdefault(model_id, {}).setdefault(template_gene, []) + if new_gene not in ortho[model_id][template_gene]: + ortho[model_id][template_gene].append(new_gene) + for per_model in ortho.values(): + for genes in per_model.values(): + genes.sort() + return ortho + + +def _apply_preferred_order(ortho: dict, order: list[str]) -> dict: + """Each new gene's reactions come from the first model (in order) that maps it.""" + winner: dict = {} # new_gene -> winning model_id + for model_id in order: + for new_genes in ortho.get(model_id, {}).values(): + for ng in new_genes: + winner.setdefault(ng, model_id) + pruned: dict = {mid: {} for mid in ortho} + for model_id, per_model in ortho.items(): + for template_gene, new_genes in per_model.items(): + kept = [ng for ng in new_genes if 
winner.get(ng) == model_id] + if kept: + pruned[model_id][template_gene] = kept + return pruned + + +def get_model_from_homology( + models, + hits: pd.DataFrame, + model_for: str, + *, + preferred_order=None, + bidirectional: bool = True, + best_hits_only: bool = False, + map_direction: str = "new_to_old", + score: str = "bitscore", + complex_policy: str = "flag", + only_genes_in_models: bool = False, + max_evalue: float = 1e-30, + min_align_len: int = 200, + min_identity: float = 40, + strictness: int | None = None, +) -> HomologyResult: + """Build a draft model for ``model_for`` by transferring reactions from templates. + + ``strictness`` (1/2/3) is a legacy alias for ``bidirectional`` / ``best_hits_only``. + """ + if isinstance(models, cobra.Model): + models = [models] + if complex_policy not in ("flag", "keep", "drop"): + raise ValueError(f"complex_policy must be flag/keep/drop, got {complex_policy!r}") + if map_direction not in ("new_to_old", "old_to_new"): + raise ValueError(f"map_direction must be new_to_old/old_to_new, got {map_direction!r}") + bidirectional, best_hits_only, complex_policy, map_direction = _strictness_to_params( + strictness, bidirectional, best_hits_only, complex_policy, map_direction + ) + validate_hits(hits) + + model_by_id = {m.id: m for m in models} + model_ids = list(model_by_id) + model_genes = {mid: {g.id for g in m.genes} for mid, m in model_by_id.items()} + all_model_genes = set().union(*model_genes.values()) if model_genes else set() + + # Sanity: each template should overlap the hits by >=5% of its genes. 
+ for mid, genes in model_genes.items(): + in_hits = genes & (set(hits.from_gene) | set(hits.to_gene)) + if genes and len(in_hits) < 0.05 * len(genes): + warnings.warn( + f"<5% of genes in template '{mid}' appear in the hits table; " + "check that the FASTA and model use the same gene identifiers.", + stacklevel=2, + ) + + if only_genes_in_models: + hits = hits[hits.from_gene.isin(all_model_genes) | hits.to_gene.isin(all_model_genes)] + + ortho = _ortholog_map( + hits, model_for, model_ids, bidirectional=bidirectional, best_hits_only=best_hits_only, + score=score, map_direction=map_direction, + model_genes=model_genes, max_evalue=max_evalue, min_align_len=min_align_len, + min_identity=min_identity, + ) + + order = [str(x) for x in preferred_order] if preferred_order else model_ids + if preferred_order and len(models) > 1: + ortho = _apply_preferred_order(ortho, order) + + # Build a per-template model holding only the transferred reactions with rewritten GPRs. + transferred = [] + for mid in order: + model = model_by_id.get(mid) + if model is None: + continue + per_model = ortho.get(mid, {}) + m = model.copy() + keep: dict[str, str] = {} + for rxn in m.reactions: + new_gpr = _rewrite_gpr(rxn, per_model, complex_policy, mid) + if new_gpr is not None: + keep[rxn.id] = new_gpr + m.remove_reactions([r for r in m.reactions if r.id not in keep], remove_orphans=True) + for rid, gpr in keep.items(): + r = m.reactions.get_by_id(rid) + r.gene_reaction_rule = gpr + r.notes = {"note": "Included by get_model_from_homology", "confidence_score": 2, + "homology_source": mid} + if m.reactions: + transferred.append(m) + + if transferred: + draft = merge_models(transferred, match_by="name") + else: + draft = cobra.Model() + draft.id = model_for + draft.name = "Generated by get_model_from_homology using " + ", ".join(model_ids) + + # Drop OLD_ placeholder genes that ended up orphaned (none survive in OR branches by construction). 
+ orphan_genes = [g for g in draft.genes if not g.reactions] + for g in orphan_genes: + draft.genes.remove(g) + + return HomologyResult(model=draft, gene_map=ortho) diff --git a/src/raven_python/reconstruction/kegg/__init__.py b/src/raven_python/reconstruction/kegg/__init__.py new file mode 100644 index 0000000..5d27602 --- /dev/null +++ b/src/raven_python/reconstruction/kegg/__init__.py @@ -0,0 +1,77 @@ +"""KEGG-based draft reconstruction (getKEGGModelForOrganism and friends). + +Maintainer build steps: 3b.1 download (:mod:`.download`), 3b.2 dump parsing +(:mod:`.parse`), 3b.3 HMM libraries (:mod:`.hmm`, :mod:`.taxonomy`). Runtime: +3b.4 model for a KEGG species (:mod:`.organism`). +""" +from raven_python.reconstruction.kegg.download import ( + download_kegg_dump, + extract_kegg_dump, + fetch_kegg_files, +) +from raven_python.reconstruction.kegg.hmm import ( + build_hmm_library, + build_ko_fastas, + build_ko_hmm, +) +from raven_python.reconstruction.kegg.organism import ( + get_kegg_model_for_organism, + get_kegg_model_for_organism_from_artefacts, +) +from raven_python.reconstruction.kegg.parse import ( + KeggCompound, + KeggKO, + KeggReaction, + build_kegg_tables, + build_reference_model, + parse_kegg_compounds, + parse_kegg_dump, + parse_kegg_kos, + parse_kegg_reactions, + read_kegg_table, + stream_organism_gene_ko, + write_kegg_tables, +) +from raven_python.reconstruction.kegg.query import ( + assign_kos, + get_kegg_model_from_sequences, + get_kegg_model_from_sequences_with_artefacts, + parse_hmmscan_tblout, + run_hmmscan, +) +from raven_python.reconstruction.kegg.taxonomy import ( + organism_domains, + organisms_in_domain, + parse_taxonomy, +) + +__all__ = [ + "KeggCompound", + "KeggKO", + "KeggReaction", + "assign_kos", + "build_hmm_library", + "build_kegg_tables", + "build_ko_fastas", + "build_ko_hmm", + "build_reference_model", + "download_kegg_dump", + "extract_kegg_dump", + "fetch_kegg_files", + "get_kegg_model_for_organism", + 
"get_kegg_model_for_organism_from_artefacts", + "get_kegg_model_from_sequences", + "get_kegg_model_from_sequences_with_artefacts", + "organism_domains", + "organisms_in_domain", + "parse_hmmscan_tblout", + "parse_kegg_compounds", + "parse_kegg_dump", + "parse_kegg_kos", + "parse_kegg_reactions", + "parse_taxonomy", + "read_kegg_table", + "run_hmmscan", + "stream_organism_gene_ko", + "write_kegg_tables", +] diff --git a/src/raven_python/reconstruction/kegg/assemble.py b/src/raven_python/reconstruction/kegg/assemble.py new file mode 100644 index 0000000..a2b5eb9 --- /dev/null +++ b/src/raven_python/reconstruction/kegg/assemble.py @@ -0,0 +1,82 @@ +"""Shared assembly of a draft model from a KO→genes mapping. + +Both KEGG runtime paths end the same way: having decided which genes belong to +which KO — from organism annotations (3b.4) or from HMM hits (3b.5) — they map +KO→reaction against the gene-free reference model, OR-join the genes into each +reaction's GPR, keep gene-backed reactions (plus spontaneous ones when allowed), +and apply the ``keep*`` quality filters. That common tail lives here. 
def flag_set(rxn_flags: pd.DataFrame | None, column: str) -> set[str]:
    """Collect the ``reaction`` ids whose ``column`` flag is truthy.

    A flag counts as truthy when its string form, stripped and lower-cased, is
    ``"true"`` or ``"1"``, which covers both real booleans and TSV strings.
    Returns an empty set when ``rxn_flags`` is ``None`` or lacks ``column``.
    """
    if rxn_flags is None:
        return set()
    if column not in rxn_flags:
        return set()

    def is_truthy(value) -> bool:
        return str(value).strip().lower() in ("true", "1")

    selected = rxn_flags[column].map(is_truthy)
    reactions = rxn_flags.loc[selected, "reaction"]
    return set(reactions)
+ if any(not keep_flag and rid in flagged for keep_flag, flagged in drop_if.values()): + continue + genes = sorted({g for ko in rxn_to_kos.get(rid, ()) for g in ko_to_genes.get(ko, ())}) + if genes: + gpr_map[rid] = genes + elif rid in spontaneous and keep_spontaneous: + spontaneous_kept.add(rid) + + keep = set(gpr_map) | spontaneous_kept + model = reference_model.copy() + if model_id is not None: + model.id = model_id + if model_name is not None: + model.name = model_name + model.remove_reactions( + [r for r in model.reactions if r.id not in keep], remove_orphans=True + ) + for rid, genes in gpr_map.items(): + model.reactions.get_by_id(rid).gene_reaction_rule = " or ".join(genes) + if note is not None: + for rid in keep: + model.reactions.get_by_id(rid).notes["note"] = note + return model, gpr_map diff --git a/src/raven_python/reconstruction/kegg/download.py b/src/raven_python/reconstruction/kegg/download.py new file mode 100644 index 0000000..8bb1826 --- /dev/null +++ b/src/raven_python/reconstruction/kegg/download.py @@ -0,0 +1,257 @@ +"""Download and arrange a local KEGG flat-file dump (step 3b.1). + +Maintainer-side, build-time tooling. Ports ``fetch_keggdb.sh`` — fetch the KEGG +FTP source archives, extract them, and lift/concatenate the files that the +parser (3b.2) and HMM build (3b.3) consume — but as **pure Python stdlib** +(``urllib`` + ``tarfile`` + ``gzip`` + ``netrc``). That drops the script's +dependence on ``wget``/``tar``/``gunzip`` (and Cygwin on Windows), so it runs +unchanged on Linux, macOS and Windows. Credential hygiene is kept: a paid KEGG +subscription's username/password are read from ``~/.netrc`` (mode 600), never +passed on the command line. + +Requires an active KEGG FTP subscription. 
Add to ``~/.netrc``:: + + machine ftp.kegg.net login YOUR_USER password YOUR_PASS + +Typical use (run once per KEGG release):: + + from raven_python.reconstruction.kegg import download_kegg_dump, parse_kegg_dump + download_kegg_dump("keggdb") # -> keggdb/{reaction,compound,ko,...} + parse_kegg_dump("keggdb", "artefacts") # -> reference model + gzipped TSVs + +The arranged dump contains: ``reaction``, ``reaction.lst``, +``reaction_mapformula.lst``, ``compound`` (compound + glycan concatenated), +``compound.inchi``, ``ko``, ``genes.pep`` (eukaryote + prokaryote proteomes +concatenated), and ``taxonomy``. +""" +from __future__ import annotations + +import gzip +import netrc +import shutil +import tarfile +import urllib.request +from pathlib import Path + +KEGG_HOST = "ftp.kegg.net" +BASE_URL = "https://ftp.kegg.net" + +# KEGG FTP paths fetched, mirroring fetch_keggdb.sh. +DEFAULT_FILES: tuple[str, ...] = ( + "kegg/ligand/reaction.tar.gz", + "kegg/ligand/compound.tar.gz", + "kegg/ligand/glycan.tar.gz", + "kegg/genes/ko.tar.gz", + "kegg/genes/fasta/eukaryotes.pep.gz", + "kegg/genes/fasta/prokaryotes.pep.gz", + "kegg/genes/misc/taxonomy", +) + + +# --------------------------------------------------------------------------- # +# Credentials +# --------------------------------------------------------------------------- # +def _resolve_auth( + host: str, + *, + netrc_path: str | Path | None = None, + auth: tuple[str, str] | None = None, +) -> tuple[str, str]: + """Return ``(user, password)`` for ``host`` from ``auth`` or a ``.netrc`` file.""" + if auth is not None: + return auth + path = Path(netrc_path) if netrc_path else Path.home() / ".netrc" + if not path.is_file(): + raise FileNotFoundError( + f"No credentials given and {path} does not exist. Create it (chmod 600) " + f"with a line:\n machine {host} login YOUR_USER password YOUR_PASS" + ) + creds = netrc.netrc(str(path)).authenticators(host) + if not creds: + raise ValueError( + f"No credentials for '{host}' in {path}. 
def fetch_kegg_files(
    dest: str | Path,
    *,
    files: tuple[str, ...] = DEFAULT_FILES,
    base_url: str = BASE_URL,
    host: str = KEGG_HOST,
    auth: tuple[str, str] | None = None,
    netrc_path: str | Path | None = None,
    force: bool = False,
    verbose: bool = True,
) -> list[Path]:
    """Download the raw KEGG archives into ``dest`` (basenames). Returns the paths.

    Existing files are skipped unless ``force=True`` (the script's ``wget -N``
    intent, simplified to skip-if-present). Each file is streamed to a
    ``<name>.part`` sibling and renamed onto its final name only once the
    transfer has finished, so an interrupted download is never mistaken for a
    complete file on the next run.

    Parameters
    ----------
    dest
        Target directory; created (with parents) if missing.
    files
        Server-relative paths to fetch; each is stored under its basename.
    base_url
        URL prefix the paths are appended to.
    host
        Machine name used to look up credentials.
    auth, netrc_path
        Explicit ``(user, password)`` or an alternative netrc file, both
        forwarded to ``_resolve_auth``.
    force
        Re-download files that already exist.
    verbose
        Print one progress line per file.

    Returns
    -------
    list[Path]
        Local paths in the order of ``files``, including skipped ones.

    Raises
    ------
    FileNotFoundError, ValueError
        Propagated from ``_resolve_auth`` when no usable credentials are found.
    """
    user, password = _resolve_auth(host, netrc_path=netrc_path, auth=auth)
    opener = _build_opener(base_url, user, password)
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    out: list[Path] = []
    for path in files:
        target = dest / Path(path).name
        if target.exists() and not force:
            if verbose:
                print(f"  skip (exists): {target.name}")
            out.append(target)
            continue
        url = f"{base_url.rstrip('/')}/{path.lstrip('/')}"
        if verbose:
            print(f"  fetching {path}")
        # Stream into a temporary sibling first; writing directly to ``target``
        # would leave a truncated file behind on failure, and the
        # skip-if-present check above would then treat it as complete.
        partial = target.with_name(target.name + ".part")
        try:
            with opener.open(url) as resp, open(partial, "wb") as handle:
                shutil.copyfileobj(resp, handle)
        except BaseException:
            # Includes KeyboardInterrupt: clean up, then let it propagate.
            partial.unlink(missing_ok=True)
            raise
        partial.replace(target)
        out.append(target)
    return out
+ """ + dest = Path(dest) + + for tar_path in sorted(dest.glob("*.tar.gz")): + with tarfile.open(tar_path) as tar: + tar.extractall(dest, filter="data") + tar_path.unlink() + + for gz_path in sorted(dest.glob("*.gz")): # only the .pep.gz remain + _gunzip(gz_path, gz_path.with_suffix("")) + gz_path.unlink() + + def lift(rel: str, tmp: str) -> Path | None: + src = dest / rel + if src.is_file(): + shutil.move(str(src), str(dest / tmp)) + return dest / tmp + return None + + reaction = lift("reaction/reaction", "_reaction") + lift("reaction/reaction.lst", "reaction.lst") + lift("reaction/reaction_mapformula.lst", "reaction_mapformula.lst") + compound = lift("compound/compound", "_compound") + lift("compound/compound.inchi", "compound.inchi") + glycan = lift("glycan/glycan", "_glycan") + ko = lift("ko/ko", "_ko") + + for subdir in ("reaction", "compound", "glycan", "ko"): + path = dest / subdir + if path.is_dir(): + shutil.rmtree(path) + + missing = [n for n, p in (("reaction", reaction), ("compound", compound), ("ko", ko)) if p is None] + if missing: + raise FileNotFoundError( + f"KEGG archives did not yield required file(s): {missing}. " + f"Check that the source .tar.gz archives are present in {dest}." 
+ ) + + shutil.move(str(reaction), str(dest / "reaction")) + shutil.move(str(ko), str(dest / "ko")) + if glycan is not None: + _concat([compound, glycan], dest / "compound") + compound.unlink() + glycan.unlink() + else: + shutil.move(str(compound), str(dest / "compound")) + + peps = [p for p in (dest / "eukaryotes.pep", dest / "prokaryotes.pep") if p.is_file()] + if peps: + _concat(peps, dest / "genes.pep") + for pep in peps: + pep.unlink() + + result: dict[str, Path] = {} + for name in ( + "reaction", + "reaction.lst", + "reaction_mapformula.lst", + "compound", + "compound.inchi", + "ko", + "genes.pep", + "taxonomy", + ): + path = dest / name + if path.is_file(): + result[name] = path + return result + + +def download_kegg_dump( + dest: str | Path, + *, + files: tuple[str, ...] = DEFAULT_FILES, + base_url: str = BASE_URL, + host: str = KEGG_HOST, + auth: tuple[str, str] | None = None, + netrc_path: str | Path | None = None, + force: bool = False, + verbose: bool = True, +) -> dict[str, Path]: + """Fetch and arrange a complete KEGG dump into ``dest``. + + Convenience wrapper chaining :func:`fetch_kegg_files` and + :func:`extract_kegg_dump`. Returns the logical-name -> path mapping of the + arranged dump, ready for :func:`raven_python.reconstruction.kegg.parse_kegg_dump`. + """ + fetch_kegg_files( + dest, + files=files, + base_url=base_url, + host=host, + auth=auth, + netrc_path=netrc_path, + force=force, + verbose=verbose, + ) + if verbose: + print(">>> Extracting and arranging KEGG dump...") + return extract_kegg_dump(dest) diff --git a/src/raven_python/reconstruction/kegg/hmm.py b/src/raven_python/reconstruction/kegg/hmm.py new file mode 100644 index 0000000..0e210b6 --- /dev/null +++ b/src/raven_python/reconstruction/kegg/hmm.py @@ -0,0 +1,453 @@ +"""Build per-KO HMM libraries from KEGG sequences (step 3b.3, maintainer-side). + +Ports RAVEN ``constructMultiFasta`` plus the clustering/alignment/training stages +of ``getKEGGModelForOrganism``. 
Run once per KEGG release to produce the +``prok90`` / ``euk90`` HMM libraries that the de-novo query path (3b.5) searches. + +Per KO, within one domain (prokaryote / eukaryote): + +1. **Multi-FASTA** — gather the member genes' sequences from ``genes.pep`` + (:func:`build_ko_fastas`). +2. **CD-HIT** — dereplicate near-identical sequences (default 90 % identity). +3. **MAFFT** — multiple-sequence alignment (``--auto --anysymbol``). +4. **hmmbuild** — train the profile HMM. + +Finally the per-KO HMMs are concatenated and ``hmmpress``-ed into a single searchable +library: a single ``hmmscan`` against the pressed database replaces a per-KO sweep with +``hmmsearch``. + +The pure parts (FASTA indexing/grouping, command construction, CD-HIT ``-n`` +choice) are unit-tested; running the binaries needs HMMER/MAFFT/CD-HIT, located +via :func:`raven_python.binaries.resolve_binary`. +""" +from __future__ import annotations + +import functools +import logging +import os +import shutil +import subprocess +import tempfile +import time +from pathlib import Path + +import pandas as pd + +from raven_python.binaries import resolve_binary +from raven_python.reconstruction.kegg.taxonomy import organisms_in_domain + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- # +# Step 1 — per-KO multi-FASTA (constructMultiFasta) +# --------------------------------------------------------------------------- # +def _full_id(organism: str, gene: str) -> str: + """The genes.pep header key for a gene, i.e. ``organism:gene``.""" + return f"{organism}:{gene}" + + +def _index_fasta(path: str | Path, wanted: set[str]) -> dict[str, tuple[int, int]]: + """Map each wanted record id to its ``(start, end)`` byte span in ``path``. + + The record id is the first whitespace-delimited token of the ``>`` header. + One streaming pass; only wanted ids are kept (memory stays small). 
+ """ + index: dict[str, tuple[int, int]] = {} + cur_id: str | None = None + cur_start = 0 + pos = 0 + with open(path, "rb") as handle: + for line in handle: + if line.startswith(b">"): + if cur_id is not None and cur_id in wanted: + index[cur_id] = (cur_start, pos) + cur_id = line[1:].split(None, 1)[0].decode() + cur_start = pos + pos += len(line) + if cur_id is not None and cur_id in wanted: + index[cur_id] = (cur_start, pos) + return index + + +def build_ko_fastas( + organism_gene_ko: pd.DataFrame, + genes_pep: str | Path, + out_dir: str | Path, + *, + organisms: set[str] | None = None, +) -> dict[str, Path]: + """Write one ``.fa`` per KO with its member genes' sequences. + + but with a stdlib offset index instead + of the Java-hashtable byte scan. ``organisms`` restricts to a domain's + organism codes (for the prok/euk split). Empty KOs are skipped (no file). + Returns ``{ko: path}`` for the files written. + """ + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + rows = organism_gene_ko + if organisms is not None: + rows = rows[rows["organism"].isin(organisms)] + + ko_to_ids: dict[str, list[str]] = {} + wanted: set[str] = set() + for organism, gene, ko in zip(rows["organism"], rows["gene"], rows["ko"], strict=True): + fid = _full_id(organism, gene) + ko_to_ids.setdefault(ko, []).append(fid) + wanted.add(fid) + + index = _index_fasta(genes_pep, wanted) + + written: dict[str, Path] = {} + with open(genes_pep, "rb") as src: + for ko, ids in ko_to_ids.items(): + present = sorted({i for i in ids if i in index}) + if not present: + continue + path = out_dir / f"{ko}.fa" + with open(path, "wb") as out: + for fid in present: + start, end = index[fid] + src.seek(start) + out.write(src.read(end - start)) + written[ko] = path + return written + + +# --------------------------------------------------------------------------- # +# Steps 2-4 — cluster, align, train (one KO) +# --------------------------------------------------------------------------- 
def _cdhit_word_size(seq_identity: float) -> str:
    """Pick the CD-HIT ``-n`` word size that goes with an identity threshold.

    Thresholds above 0.7 use ``5``, above 0.6 ``4``, above 0.5 ``3`` and the
    rest of the accepted range ``2``. Raises ``ValueError`` for a threshold
    outside ``(0.4, 1.0]``.
    """
    in_range = seq_identity > 0.4 and seq_identity <= 1.0
    if not in_range:
        raise ValueError("seq_identity must be in (0.4, 1.0] (or -1 to skip CD-HIT).")
    # Ordered from the highest floor down; the first floor exceeded wins.
    for floor, word_size in ((0.7, "5"), (0.6, "4"), (0.5, "3")):
        if seq_identity > floor:
            return word_size
    return "2"


def _count_sequences(fasta: Path) -> int:
    """Count the records in ``fasta``, i.e. the lines starting with ``>``."""
    headers = 0
    with open(fasta, "rb") as handle:
        for raw in handle:
            if raw[:1] == b">":
                headers += 1
    return headers


def _fasta_stats(fasta: Path) -> tuple[int, int]:
    """Return ``(sequence_count, total_residues)`` from a single read of ``fasta``.

    Header lines count as sequences; every other line contributes its length
    after stripping surrounding whitespace.
    """
    seq_count = 0
    residue_total = 0
    with open(fasta, "rb") as handle:
        for raw in handle:
            if raw[:1] == b">":
                seq_count += 1
                continue
            residue_total += len(raw.strip())
    return seq_count, residue_total


def _cdhit_cmd(cdhit: str, inp: Path, out: Path, seq_identity: float, threads: int) -> list[str]:
    """Assemble the CD-HIT argument list for clustering ``inp`` into ``out``.

    ``-n`` is derived from ``seq_identity`` via :func:`_cdhit_word_size` (which
    raises ``ValueError`` for an unsupported threshold); ``-M`` is fixed at 2000.
    """
    word_size = _cdhit_word_size(seq_identity)
    options = {
        "-i": str(inp),
        "-o": str(out),
        "-c": str(seq_identity),
        "-n": word_size,
        "-M": "2000",
        "-T": str(threads),
    }
    cmd = [cdhit]
    for flag, value in options.items():
        cmd += [flag, value]
    return cmd


# MAFFT uses fast progressive FFT-NS-2 until an alignment is large enough to
# threaten memory, then switches to memory-light PartTree (which keeps all
# sequences; only the guide tree is approximated).
#
# Peak FFT-NS-2 RSS is driven by the progressive-alignment DP work, ~ n_seqs ×
# (mean length)^2 (equivalently residues^2 / n_seqs) — NOT residue count alone:
# a few hundred long proteins cost far more than the same residues spread over
# many short ones. Empirical fit (real KEGG sequences, 12 threads):
#   RSS_GB ≈ _MAFFT_GB_PER_COST × (n_seqs × mean_len^2)
# Measured (residues, n_seqs, RSS): 250k/266/0.67, 500k/534/1.25, 1.0M/1066/3.16,
# 1.5M/1624/5.73, and K12047 941k/452 (mean len 2082) which OOM'd >7 GB — its
# cost 1.96e9 is the largest of all, hence the length-aware metric.
+_MAFFT_GB_PER_COST = 4.2e-9 # GB per unit of (n_seqs × mean_len^2); conservative upper bound +_MAFFT_MEMORY_OVERHEAD_GB = 2.5 # RAM not for MAFFT (OS + WSL2 + Python); WSL total overcounts +_MEMORY_SAFETY = 0.65 # leave headroom; never budget MAFFT to the brink +_DEFAULT_COST_BUDGET = 5e8 # fallback DP-cost budget when total memory can't be detected +_LOW_MEMORY_BYTES = 16 * 1024**3 # below this, warn that the budget is conservative + + +def _total_memory_bytes() -> int | None: + try: + return os.sysconf("SC_PHYS_PAGES") * os.sysconf("SC_PAGE_SIZE") + except (AttributeError, ValueError, OSError): + return None + + +def _alignment_cost(n_seqs: int, residues: int) -> float: + """FFT-NS-2 memory proxy: ``n_seqs × mean_len^2`` = ``residues^2 / n_seqs``.""" + return residues * residues / n_seqs if n_seqs else 0.0 + + +@functools.lru_cache(maxsize=1) +def _auto_cost_budget() -> float: + """Max FFT-NS-2 DP-cost (``n_seqs × mean_len^2``) before switching to PartTree. + + Derived from available RAM via the measured memory model; above it, an + alignment is predicted to exceed a safe fraction of the RAM left for MAFFT. + Computed and logged once; warns on low-memory hosts (more KOs then use the + approximate PartTree). + """ + total = _total_memory_bytes() + if total is None: + logger.warning( + "Could not detect system memory; using default MAFFT cost budget %.2e. " + "Pass parttree_residues to override.", _DEFAULT_COST_BUDGET, + ) + return _DEFAULT_COST_BUDGET + total_gb = total / 1024**3 + mafft_gb = max(total_gb - _MAFFT_MEMORY_OVERHEAD_GB, 0.5) + budget = _MEMORY_SAFETY * mafft_gb / _MAFFT_GB_PER_COST + logger.info( + "MAFFT DP-cost budget %.2e auto-set from %.1f GB RAM (~%.1f GB for MAFFT)", + budget, total_gb, mafft_gb, + ) + if total < _LOW_MEMORY_BYTES: + logger.warning( + "Limited memory (%.1f GB total): MAFFT cost budget set conservatively to " + "%.2e, so more (especially long-protein) KOs use the approximate PartTree " + "alignment. 
With more RAM, fewer would.", total_gb, budget, + ) + return budget + + +def _mafft_cmd( + mafft: str, inp: Path, threads: int, *, fast: bool = True, parttree: bool = False +) -> list[str]: + """Build the MAFFT command. + + ``fast`` selects FFT-NS-2 (``--retree 2 --maxiterate 0``) — fast progressive + alignment, the right trade-off for building profile HMMs — instead of + ``--auto`` (which picks slow iterative refinement on medium/large inputs). + ``parttree`` adds MAFFT's PartTree approximation for very large inputs. + """ + cmd = [mafft] + if parttree: + cmd += ["--retree", "2", "--parttree"] + elif fast: + cmd += ["--retree", "2", "--maxiterate", "0"] + else: + cmd += ["--auto"] + cmd += ["--anysymbol", "--thread", str(threads), str(inp)] + return cmd + + +def _hmmbuild_cmd( + hmmbuild: str, out_hmm: Path, aligned: Path, threads: int, name: str | None = None +) -> list[str]: + cmd = [hmmbuild, "--cpu", str(threads)] + if name: # name the profile after its KO so hmmscan targets are KO ids + cmd += ["-n", name] + cmd += [str(out_hmm), str(aligned)] + return cmd + + +def _run(cmd: list[str], *, stdout_path: Path | None = None) -> str: + """Run a command; optionally redirect stdout to a file. Raises on failure.""" + if stdout_path is not None: + with open(stdout_path, "w") as out: + proc = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True) + stderr = proc.stderr or "" + else: + proc = subprocess.run(cmd, capture_output=True, text=True) + stderr = proc.stderr or "" + if proc.returncode != 0: + raise RuntimeError(f"{Path(cmd[0]).name} failed:\n{stderr.strip()}") + return stderr + + +def _staged_run( + cmd: list[str], *, label: str, stage: str, verbose: bool, + stdout_path: Path | None = None, log: bool = True, +) -> float: + """Run a stage's command; log one completion line per stage (when verbose). 
+ + At INFO (when ``log``): a single ``[KO] stage: done in X.Xs`` line — the + ``stage`` descriptor already names the tool/mode and any seq/res/cost context, + so the timing is just appended rather than repeated on a second line. The + tool's own stderr (MAFFT/CD-HIT/hmmbuild progress) is logged at DEBUG. Pass + ``log=False`` to suppress the line so the caller can fold the timing into its + own message. Returns the stage's wall-clock seconds. + """ + start = time.perf_counter() + stderr = _run(cmd, stdout_path=stdout_path) + elapsed = time.perf_counter() - start + if verbose: + if log: + logger.info("[%s] %s: done in %.1fs", label, stage, elapsed) + if stderr.strip(): + logger.debug("[%s] %s output:\n%s", label, stage, stderr.strip()) + return elapsed + + +def build_ko_hmm( + ko_fasta: str | Path, + out_hmm: str | Path, + *, + seq_identity: float = 0.9, + parttree_residues: int | None = None, + threads: int = 1, + fast: bool = True, + verbose: bool = False, + cdhit: str | Path | None = None, + mafft: str | Path | None = None, + hmmbuild: str | Path | None = None, +) -> Path: + """Cluster, align and train a profile HMM for one KO's multi-FASTA. + + Single-sequence KOs skip CD-HIT/MAFFT (a lone sequence is its own alignment). + ``seq_identity=-1`` skips CD-HIT. All (deduplicated) sequences are kept — + memory on large KOs is bounded by switching MAFFT to PartTree, not by + dropping sequences. ``fast`` uses MAFFT FFT-NS-2 (fast progressive) rather + than ``--auto``'s slow iterative refinement. MAFFT switches to memory-light + PartTree once an alignment is predicted to be too memory-heavy: by default from + its **DP cost** (``n_seqs × mean_len²`` — long proteins cost far more than the + same residue count in short ones) against a RAM-derived budget + (:func:`_auto_cost_budget`). Passing ``parttree_residues`` overrides this with a + simple residue-count cutoff. 
+ ``verbose`` logs (via the ``logging`` module, INFO/DEBUG) which tool is running + for this KO, sequence counts at each stage, timings, and the tools' own + output. Returns ``out_hmm``. + """ + ko_fasta = Path(ko_fasta) + out_hmm = Path(out_hmm) + label = out_hmm.stem + out_hmm.parent.mkdir(parents=True, exist_ok=True) + n = _count_sequences(ko_fasta) + if n == 0: + raise ValueError(f"{ko_fasta} contains no sequences.") + if verbose: + logger.info("[%s] start: %d sequences", label, n) + + hmmbuild = resolve_binary("hmmbuild", binary=hmmbuild) + with tempfile.TemporaryDirectory() as tmp: + tmp = Path(tmp) + if n == 1: + if verbose: + logger.info("[%s] single sequence: skipping CD-HIT/MAFFT", label) + aligned = ko_fasta # trivially aligned + else: + clustered = ko_fasta + cdhit_elapsed: float | None = None + if seq_identity != -1: + clustered = tmp / "clustered.fa" + cdhit_elapsed = _staged_run( + _cdhit_cmd( + resolve_binary("cd-hit", binary=cdhit), ko_fasta, clustered, + seq_identity, threads, + ), + label=label, stage=f"CD-HIT ({seq_identity})", verbose=verbose, log=False, + ) + n_clustered, residues = _fasta_stats(clustered) + if verbose and cdhit_elapsed is not None: + logger.info( + "[%s] CD-HIT (%s): %d -> %d sequences in %.1fs", + label, seq_identity, n, n_clustered, cdhit_elapsed, + ) + aligned = tmp / "aligned.fa" + if n_clustered == 1: + if verbose: + logger.info("[%s] one sequence after CD-HIT: skipping MAFFT", label) + shutil.copyfile(clustered, aligned) # MAFFT can't align a single seq + else: + # PartTree once the alignment is too memory-heavy. Default: its DP + # cost (n_seqs × mean_len^2) vs a RAM-derived budget — length-aware, + # so long-protein KOs (few seqs, huge residues) route correctly. + # parttree_residues, if given, overrides with a residue-count cutoff. 
+ cost = _alignment_cost(n_clustered, residues) + if parttree_residues is None: + parttree = cost > _auto_cost_budget() + else: + parttree = residues > parttree_residues + _staged_run( + _mafft_cmd( + resolve_binary("mafft", binary=mafft), clustered, threads, + fast=fast, parttree=parttree, + ), + label=label, + stage=f"MAFFT {'PartTree' if parttree else 'FFT-NS-2' if fast else 'auto'} " + f"({n_clustered} seqs, {residues} res, cost {cost:.2e})", + verbose=verbose, + stdout_path=aligned, + ) + _staged_run( + _hmmbuild_cmd(hmmbuild, out_hmm, aligned, threads, name=label), + label=label, stage="hmmbuild", verbose=verbose, + ) + if verbose: + logger.info("[%s] complete -> %s", label, out_hmm) + return out_hmm + + +# --------------------------------------------------------------------------- # +# Orchestration — a full domain library +# --------------------------------------------------------------------------- # +def build_hmm_library( + organism_gene_ko: pd.DataFrame, + genes_pep: str | Path, + taxonomy: str | Path, + out_dir: str | Path, + *, + domain: str, + seq_identity: float = 0.9, + parttree_residues: int | None = None, + threads: int = 1, + fast: bool = True, + verbose: bool = False, + press: bool = True, + cdhit: str | Path | None = None, + mafft: str | Path | None = None, + hmmbuild: str | Path | None = None, + hmmpress: str | Path | None = None, +) -> dict[str, Path | list[Path]]: + """Build a domain (``"prokaryotes"``/``"eukaryotes"``) HMM library. + + Restricts genes to the domain's organisms (from ``taxonomy``), builds a + multi-FASTA and a profile HMM per KO under ``out_dir``, and (if ``press``) + concatenates them into ``out_dir/library.hmm`` and ``hmmpress``-es it for fast + ``hmmscan`` querying. Returns ``{"hmms": [...], "library": path | None}``. + + Heavy and binary-dependent — intended for the maintainer, run once per KEGG + release. Skips KOs that already have an ``.hmm`` (resumable). 
+ """ + out_dir = Path(out_dir) + fasta_dir = out_dir / "fasta" + hmm_dir = out_dir / "hmms" + hmm_dir.mkdir(parents=True, exist_ok=True) + + organisms = organisms_in_domain(taxonomy, domain) + if not organisms: + raise ValueError(f"No organisms found for domain {domain!r} in {taxonomy}.") + + ko_fastas = build_ko_fastas(organism_gene_ko, genes_pep, fasta_dir, organisms=organisms) + + hmms: list[Path] = [] + for ko, fasta in ko_fastas.items(): + out_hmm = hmm_dir / f"{ko}.hmm" + if not out_hmm.exists(): + build_ko_hmm( + fasta, out_hmm, seq_identity=seq_identity, + parttree_residues=parttree_residues, threads=threads, fast=fast, + verbose=verbose, cdhit=cdhit, mafft=mafft, hmmbuild=hmmbuild, + ) + hmms.append(out_hmm) + + library: Path | None = None + if press and hmms: + library = out_dir / "library.hmm" + with open(library, "wb") as out: + for hmm in sorted(hmms): + with open(hmm, "rb") as fh: + shutil.copyfileobj(fh, out) + _run([resolve_binary("hmmpress", binary=hmmpress), "-f", str(library)]) + + return {"hmms": hmms, "library": library} diff --git a/src/raven_python/reconstruction/kegg/organism.py b/src/raven_python/reconstruction/kegg/organism.py new file mode 100644 index 0000000..9f30575 --- /dev/null +++ b/src/raven_python/reconstruction/kegg/organism.py @@ -0,0 +1,153 @@ +"""Build a draft model for a KEGG species from the reference artefacts (step 3b.4). + +Ports the **organism-ID** path of RAVEN ``getKEGGModelForOrganism`` (the branch +taken when no FASTA file is given). For an organism already annotated in KEGG it +needs no homology search: take the organism's gene↔KO assignments, map KO→reaction +against the gene-free reference model, OR-join the organism's genes into each +reaction's GPR, and keep the reactions that end up with genes (plus spontaneous +reactions, optionally). The HMM/FASTA path is step 3b.5 (:mod:`.query`). 
_NOTE = "Included by get_kegg_model_for_organism (no HMMs)"


def get_kegg_model_for_organism(
    organism_id: str,
    reference_model: cobra.Model,
    ko_reaction: pd.DataFrame,
    organism_gene_ko: pd.DataFrame,
    *,
    rxn_flags: pd.DataFrame | None = None,
    taxonomy: str | Path | None = None,
    keep_spontaneous: bool = True,
    keep_undefined_stoich: bool = True,
    keep_incomplete: bool = True,
    keep_general: bool = False,
) -> cobra.Model:
    """Build a draft model for a KEGG organism (or whole domain) from KO annotations.

    Parameters
    ----------
    organism_id
        A KEGG organism code (e.g. ``"eco"``), or a domain name listed in
        ``_DOMAINS`` for a whole-domain model (then ``taxonomy`` is required).
        Compared in lower case.
    reference_model
        The gene-free KEGG reference model.
    ko_reaction, organism_gene_ko, rxn_flags
        The relational tables. ``organism_gene_ko`` must provide ``organism``,
        ``gene`` and ``ko`` columns.
    taxonomy
        Path to the KEGG ``taxonomy`` file; only consulted in domain mode.
    keep_spontaneous, keep_undefined_stoich, keep_incomplete, keep_general
        Quality filters, forwarded unchanged to
        ``assemble_model_from_ko_genes`` together with ``rxn_flags``.

    Returns
    -------
    cobra.Model
        The model returned by ``assemble_model_from_ko_genes``, with a
        ``kegg.genes`` annotation (``organism:gene``) set on every gene.

    Raises
    ------
    ValueError
        In domain mode without ``taxonomy``, or when the organism code has no
        rows in ``organism_gene_ko``.
    """
    code = organism_id.lower()
    domain_mode = code in _DOMAINS

    if domain_mode and taxonomy is None:
        raise ValueError(
            f"Domain mode ({organism_id!r}) needs the KEGG taxonomy file; "
            "pass taxonomy=."
        )

    if domain_mode:
        # Whole domain: keep the rows of every organism classified under it.
        domain_members = organisms_in_domain(taxonomy, code)
        lowered = organism_gene_ko["organism"].str.lower()
        selected = organism_gene_ko[lowered.isin(domain_members)]
    else:
        lowered = organism_gene_ko["organism"].str.lower()
        if code not in set(lowered):
            raise ValueError(
                f"Organism '{organism_id}' has no genes in organism_gene_ko. "
                f"Provide a KEGG species code present in the table."
            )
        selected = organism_gene_ko[lowered == code]

    genes_by_ko: dict[str, list[str]] = {}
    triples = zip(selected["organism"], selected["gene"], selected["ko"], strict=True)
    for organism_code, gene_name, ko_id in triples:
        # Bare gene ids can collide between organisms, so domain mode prefixes
        # each one with its (lower-cased) organism code.
        qualified = f"{organism_code.lower()}:{gene_name}" if domain_mode else gene_name
        if ko_id in genes_by_ko:
            genes_by_ko[ko_id].append(qualified)
        else:
            genes_by_ko[ko_id] = [qualified]

    model, _ = assemble_model_from_ko_genes(
        reference_model,
        ko_reaction,
        genes_by_ko,
        rxn_flags=rxn_flags,
        keep_spontaneous=keep_spontaneous,
        keep_undefined_stoich=keep_undefined_stoich,
        keep_incomplete=keep_incomplete,
        keep_general=keep_general,
        model_id=organism_id,
        model_name=f"Generated by get_kegg_model_for_organism for {organism_id}",
        note=_NOTE,
    )

    for gene in model.genes:
        # An id that already contains ":" is taken to be organism-qualified
        # (domain mode); a bare id gets the requested organism code prepended.
        if ":" in gene.id:
            gene.annotation["kegg.genes"] = gene.id
        else:
            gene.annotation["kegg.genes"] = f"{code}:{gene.id}"
    return model


def get_kegg_model_for_organism_from_artefacts(
    organism_id: str,
    artefact_dir: str | Path | None = None,
    *,
    version: str | None = None,
    **kwargs,
) -> cobra.Model:
    """Load the published artefacts from ``artefact_dir`` and build the model.

    Reads ``reference_model.yml.gz`` plus the ``ko_reaction.tsv.gz``,
    ``organism_gene_ko.tsv.xz`` and ``rxn_flags.tsv.gz`` tables, then delegates
    to :func:`get_kegg_model_for_organism`; extra keyword arguments are passed
    through to it. When ``artefact_dir`` is ``None`` the directory is obtained
    from :func:`raven_python.data.ensure_kegg_data` (``version`` selects the
    release).
    """
    if artefact_dir is None:
        # Imported lazily, only on the path that actually needs it.
        from raven_python.data import ensure_kegg_data

        artefact_dir = ensure_kegg_data(version=version)
    root = Path(artefact_dir)

    reference = read_yaml_model(root / "reference_model.yml.gz")
    table_files = (
        ("ko_reaction", "ko_reaction.tsv.gz"),
        ("organism_gene_ko", "organism_gene_ko.tsv.xz"),
        ("rxn_flags", "rxn_flags.tsv.gz"),
    )
    tables = {key: read_kegg_table(root / filename) for key, filename in table_files}

    return get_kegg_model_for_organism(
        organism_id,
        reference,
        tables["ko_reaction"],
        tables["organism_gene_ko"],
        rxn_flags=tables["rxn_flags"],
        **kwargs,
    )
+* **K3** — quality labels become a tidy boolean ``rxn_flags`` table instead of + free-text appended to ``rxnNotes``. + +The KEGG flat-file format: each entry is a block of lines terminated by ``///``; +a field label occupies columns 1-12, continuation lines are indented 12 spaces. +""" +from __future__ import annotations + +import gzip +import heapq +import lzma +import re +import tempfile +from collections.abc import Iterator +from dataclasses import dataclass, field +from pathlib import Path + +import cobra +import pandas as pd + +from raven_python.io.yaml import write_yaml_model + +# A KEGG entry id is the first token after the 12-char ENTRY label (6 chars: +# R00010, C00001, K01194, ...). +_ID_LEN = 6 +_LABEL_WIDTH = 12 + +# Compound token inside an equation, optionally a glycan (G) or drug (D); we also +# tolerate trailing polymer suffixes like "C00404(n)" by matching the stem. +_MET_TOKEN = re.compile(r"^([CGD]\d{5})") +_NUMERIC = re.compile(r"^\d+(\.\d+)?$") + + +# --------------------------------------------------------------------------- # +# Generic flat-file reader +# --------------------------------------------------------------------------- # +def _iter_entries(path: str | Path) -> Iterator[dict[str, list[str]]]: + """Yield one ``{field_label: [value_lines]}`` dict per ``///``-delimited entry. + + Field labels (columns 1-12) key a list of their value lines in file order; + continuation lines (12 leading spaces) append to the current field. 
+ """ + entry: dict[str, list[str]] = {} + current: str | None = None + with open(path, encoding="utf-8") as handle: + for raw in handle: + line = raw.rstrip("\n") + if line.startswith("///"): + if entry: + yield entry + entry, current = {}, None + continue + if not line.strip(): + continue + label = line[:_LABEL_WIDTH].strip() + value = line[_LABEL_WIDTH:].rstrip() + if label: + current = label + entry.setdefault(current, []).append(value) + elif current is not None: + entry[current].append(value) + if entry: # tolerate a missing final '///' + yield entry + + +# --------------------------------------------------------------------------- # +# Reactions +# --------------------------------------------------------------------------- # +@dataclass +class KeggReaction: + """A reaction parsed from the KEGG ``reaction`` flat file.""" + + id: str + name: str = "" + equation: str = "" + reversible: bool = True + eccodes: list[str] = field(default_factory=list) + kos: list[str] = field(default_factory=list) + pathways: list[str] = field(default_factory=list) + spontaneous: bool = False + incomplete: bool = False + general: bool = False + undefined_stoich: bool = False + # Cached stoichiometry from ``_parse_equation(equation)``: populated by + # :func:`parse_kegg_reactions` so :func:`build_reference_model` reuses the + # parse instead of repeating it (KEGG has ~12k reactions; a full redundant + # parse cost a noticeable chunk of the build). 
+ stoichiometry: dict[str, float] = field(default_factory=dict) + + +def _first_id(lines: list[str]) -> str: + return lines[0][:_ID_LEN].strip() if lines else "" + + +def _comment_flags(rxn: KeggReaction, comment: str) -> None: + text = comment.upper() + rxn.spontaneous = "SPONTANEOUS" in text + rxn.incomplete = any(w in text for w in ("INCOMPLETE", "ERRONEOUS", "UNCLEAR")) + rxn.general = "GENERAL REACTION" in text + + +def _parse_equation(equation: str) -> tuple[dict[str, float], bool, bool]: + """Parse a KEGG equation into ``({met_id: coef}, reversible, undefined_stoich)``. + + Reactants get negative coefficients, products positive. Non-numeric + coefficients (``n``, ``(n+1)``, ``2n``) are treated as 1.0 and flag the + reaction as having undefined stoichiometry (improvement K2). + """ + reversible = "<=>" in equation + parts = re.split(r"\s(?:<=>|=>|<=)\s", equation, maxsplit=1) + lhs, rhs = (parts + ["", ""])[:2] + + stoich: dict[str, float] = {} + undefined = False + for side, sign in ((lhs, -1.0), (rhs, 1.0)): + for term in filter(None, (t.strip() for t in side.split(" + "))): + tokens = term.split() + met_token = tokens[-1] + coef_tokens = tokens[:-1] + if coef_tokens and _NUMERIC.match(coef_tokens[0]): + coef = float(coef_tokens[0]) + else: + coef = 1.0 + if coef_tokens: # a symbolic coefficient like 'n' or '(n+1)' + undefined = True + match = _MET_TOKEN.match(met_token) + if not match: # unparseable term -> flag, keep raw token + undefined = True + met_id = met_token + else: + met_id = match.group(1) + stoich[met_id] = stoich.get(met_id, 0.0) + sign * coef + # Drop metabolites that cancel out (A <=> A + B leaves A at 0). + stoich = {m: c for m, c in stoich.items() if c != 0.0} + return stoich, reversible, undefined + + +def parse_kegg_reactions(kegg_dir: str | Path) -> list[KeggReaction]: + """Parse ``/reaction`` into :class:`KeggReaction` records. 
+ + Reversibility is taken from the equation arrow and, when + ``reaction_mapformula.lst`` is present, refined to mark reactions that are + irreversible across all KEGG maps (see :func:`_irreversible_from_mapformula`). + """ + kegg_dir = Path(kegg_dir) + reactions: list[KeggReaction] = [] + for entry in _iter_entries(kegg_dir / "reaction"): + rxn = KeggReaction(id=_first_id(entry.get("ENTRY", []))) + if not rxn.id: + continue + if entry.get("NAME"): + rxn.name = entry["NAME"][0].rstrip(";").strip() + if entry.get("COMMENT"): + _comment_flags(rxn, " ".join(entry["COMMENT"])) + if entry.get("ENZYME"): + rxn.eccodes = [ec for line in entry["ENZYME"] for ec in line.split()] + rxn.kos = [line[:_ID_LEN].strip() for line in entry.get("ORTHOLOGY", [])] + for line in entry.get("PATHWAY", []): + pid = line[:7].strip() + if pid and not pid.startswith(("rn011", "rn012")): # skip global/overview + rxn.pathways.append(pid) + if entry.get("EQUATION"): + rxn.equation = " ".join(s.strip() for s in entry["EQUATION"]) + stoich, rxn.reversible, rxn.undefined_stoich = _parse_equation(rxn.equation) + rxn.stoichiometry = stoich # cached for build_reference_model + reactions.append(rxn) + + irrev = _irreversible_from_mapformula(kegg_dir / "reaction_mapformula.lst") + for rxn in reactions: + if rxn.id in irrev: + rxn.reversible = False + return reactions + + +def _irreversible_from_mapformula(path: str | Path) -> set[str]: + """Reaction ids that are irreversible in *every* KEGG map they appear in. + + ``reaction_mapformula.lst`` lines look like ``R00005: 00330: C01010 => C00011``. + A reaction is considered irreversible only if no map lists it as ``<=>`` and + every map draws it in the same direction. Direction (substrate/product order) + is not propagated back into the model stoichiometry — a documented + simplification of RAVEN's column-flipping logic, which only affects the small + set of map-directional reactions. 
+ """ + path = Path(path) + if not path.is_file(): + return set() + seen_reversible: set[str] = set() + products: dict[str, str] = {} + conflicting: set[str] = set() + for entry in _iter_mapformula_lines(path): + rid, reversible, product = entry + if reversible: + seen_reversible.add(rid) + elif rid in products and products[rid] != product: + conflicting.add(rid) # drawn both directions across maps -> reversible + else: + products.setdefault(rid, product) + return {rid for rid in products if rid not in seen_reversible and rid not in conflicting} + + +def _iter_mapformula_lines(path: Path) -> Iterator[tuple[str, bool, str]]: + with open(path, encoding="utf-8") as handle: + for raw in handle: + line = raw.strip() + if not line or ":" not in line: + continue + rid = line[:_ID_LEN] + reversible = "<=>" in line + product = line.split()[-1] + yield rid, reversible, product + + +# --------------------------------------------------------------------------- # +# Compounds +# --------------------------------------------------------------------------- # +@dataclass +class KeggCompound: + """A metabolite parsed from the KEGG ``compound`` flat file.""" + + id: str + name: str = "" + formula: str = "" + inchi: str = "" + chebi: list[str] = field(default_factory=list) + pubchem: list[str] = field(default_factory=list) + + +def parse_kegg_compounds(kegg_dir: str | Path) -> list[KeggCompound]: + """Parse ``/compound`` (+ optional ``compound.inchi``) into records.""" + kegg_dir = Path(kegg_dir) + compounds: list[KeggCompound] = [] + for entry in _iter_entries(kegg_dir / "compound"): + cid = _first_id(entry.get("ENTRY", [])) + if not cid: + continue + cmp = KeggCompound(id=cid) + if entry.get("NAME"): + # Only the first synonym; KEGG separates them with ';'. 
+ cmp.name = entry["NAME"][0].split(";")[0].strip() + if entry.get("FORMULA"): + cmp.formula = entry["FORMULA"][0].strip() + for line in entry.get("DBLINKS", []): + if line.startswith("ChEBI:"): + cmp.chebi += [f"CHEBI:{x}" for x in line.split(":", 1)[1].split()] + elif line.startswith("PubChem:"): + cmp.pubchem += line.split(":", 1)[1].split() + compounds.append(cmp) + + inchis = _parse_inchis(kegg_dir / "compound.inchi") + for cmp in compounds: + if cmp.id in inchis: + cmp.inchi = inchis[cmp.id] + cmp.formula = "" # prefer the InChI; matches RAVEN + return compounds + + +def _parse_inchis(path: str | Path) -> dict[str, str]: + path = Path(path) + if not path.is_file(): + return {} + out: dict[str, str] = {} + with open(path, encoding="utf-8") as handle: + for raw in handle: + cid, _, inchi = raw.rstrip("\n").partition("\t") + if cid and inchi: + out[cid.strip()] = inchi.strip() + return out + + +# --------------------------------------------------------------------------- # +# KOs and organism genes +# --------------------------------------------------------------------------- # +@dataclass +class KeggKO: + """A KEGG Orthology entry: its name and the organism genes assigned to it.""" + + id: str + name: str = "" + genes: list[tuple[str, str]] = field(default_factory=list) # (organism, gene) + + +def parse_kegg_kos(kegg_dir: str | Path, *, keep: set[str] | None = None) -> list[KeggKO]: + """Parse ``/ko`` into :class:`KeggKO` records (name + organism genes). + + ``keep`` limits parsing to those KO ids (e.g. only KOs linked to reactions), + mirroring RAVEN's ``koList`` argument — the gene lists are huge, so this is + the usual call. 
+ """ + ko_records: list[KeggKO] = [] + for entry in _iter_entries(Path(kegg_dir) / "ko"): + ko_id = _first_id(entry.get("ENTRY", [])) + if not ko_id or (keep is not None and ko_id not in keep): + continue + ko = KeggKO(id=ko_id) + if entry.get("DEFINITION"): + ko.name = entry["DEFINITION"][0].strip() + ko.genes = list(_parse_gene_lines(entry.get("GENES", []))) + ko_records.append(ko) + return ko_records + + +def _parse_gene_lines(lines: list[str]) -> Iterator[tuple[str, str]]: + """Yield ``(organism, gene)`` pairs from a KO entry's GENES block. + + Lines look like ``BSU: BSU31050(gbsB) BSU31060`` — an upper-case organism + code, a colon, then space-separated gene ids (with an optional ``(name)`` + suffix that we strip). Organism codes are lower-cased to match KEGG's protein + sequence files (as RAVEN does). + """ + for line in lines: + org, sep, rest = line.partition(":") + if not sep: + continue + organism = org.strip().lower() + for token in rest.split(): + gene = token.split("(", 1)[0] + if gene: + yield organism, gene + + +# --------------------------------------------------------------------------- # +# Reference model + tables +# --------------------------------------------------------------------------- # +_COMPARTMENT = "s" # single 'system' compartment, as in getModelFromKEGG + + +def build_reference_model( + reactions: list[KeggReaction], compounds: list[KeggCompound] +) -> cobra.Model: + """Assemble the gene-free KEGG reference model from parsed records. + + Only metabolites actually used by a reaction are added. Reactions carry KEGG + annotations (reaction id, KO ids, EC codes, pathways) but **no genes/GPRs**. + Bounds are ``(-1000, 1000)`` for reversible reactions and ``(0, 1000)`` + otherwise. 
+ """ + model = cobra.Model("KEGG") + model.name = "Automatically generated from KEGG database" + + by_id = {c.id: c for c in compounds} + # Reuse the cached parse from parse_kegg_reactions; only re-parse for + # callers that constructed KeggReaction records without the cache. + parsed = { + r.id: (r.stoichiometry if r.stoichiometry else _parse_equation(r.equation)[0]) + for r in reactions + } + used = {m for stoich in parsed.values() for m in stoich} + + metabolites = [] + for cid in sorted(used): + cmp = by_id.get(cid) + met = cobra.Metabolite(cid, compartment=_COMPARTMENT) + if cmp: + met.name = cmp.name or cid + met.formula = cmp.formula or None + if cmp.chebi: + met.annotation["chebi"] = cmp.chebi + if cmp.pubchem: + met.annotation["pubchem.substance"] = cmp.pubchem + if cmp.inchi: + met.annotation["inchi"] = cmp.inchi + else: + met.name = cid + metabolites.append(met) + model.add_metabolites(metabolites) + met_index = {m.id: m for m in metabolites} + + cobra_reactions = [] + for rxn in reactions: + stoich = parsed[rxn.id] + if not stoich: # empty (e.g. A <=> A) -> skip, as RAVEN drops bad rxns + continue + reaction = cobra.Reaction(rxn.id, name=rxn.name) + reaction.bounds = (-1000.0, 1000.0) if rxn.reversible else (0.0, 1000.0) + reaction.add_metabolites({met_index[m]: c for m, c in stoich.items()}) + reaction.annotation["kegg.reaction"] = rxn.id + if rxn.kos: + reaction.annotation["kegg.orthology"] = rxn.kos + if rxn.eccodes: + reaction.annotation["ec-code"] = rxn.eccodes + if rxn.pathways: + reaction.annotation["kegg.pathway"] = rxn.pathways + cobra_reactions.append(reaction) + model.add_reactions(cobra_reactions) + return model + + +def build_kegg_tables( + reactions: list[KeggReaction], kos: list[KeggKO] +) -> dict[str, pd.DataFrame]: + """Build the minimal relational tables from parsed records. + + Returns a dict of ``DataFrame``s keyed by table name: ``ko_reaction``, + ``ko_names``, ``organism_gene_ko``, ``rxn_flags``. 
+ """ + ko_reaction = pd.DataFrame( + [(ko, r.id) for r in reactions for ko in r.kos], + columns=["ko", "reaction"], + ).drop_duplicates(ignore_index=True) + + ko_names = pd.DataFrame( + [(ko.id, ko.name) for ko in kos], columns=["ko", "name"] + ) + + organism_gene_ko = pd.DataFrame( + [(org, gene, ko.id) for ko in kos for org, gene in ko.genes], + columns=["organism", "gene", "ko"], + ).drop_duplicates(ignore_index=True) + + rxn_flags = pd.DataFrame( + [ + (r.id, r.spontaneous, r.undefined_stoich, r.incomplete, r.general) + for r in reactions + ], + columns=["reaction", "spontaneous", "undefined_stoich", "incomplete", "general"], + ) + + return { + "ko_reaction": ko_reaction, + "ko_names": ko_names, + "organism_gene_ko": organism_gene_ko, + "rxn_flags": rxn_flags, + } + + +def write_kegg_tables(tables: dict[str, pd.DataFrame], out_dir: str | Path) -> list[Path]: + """Write each table as a gzipped TSV (``.tsv.gz``) into ``out_dir``. + + Gzipped TSV is the dependency-free cross-language format shared with MATLAB + RAVEN (see docs/kegg_data_format.md). Returns the written paths. + """ + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + written = [] + for name, frame in tables.items(): + path = out_dir / f"{name}.tsv.gz" + with gzip.open(path, "wt", encoding="utf-8", newline="") as handle: + frame.to_csv(handle, sep="\t", index=False) + written.append(path) + return written + + +def read_kegg_table(path: str | Path) -> pd.DataFrame: + """Read a KEGG table written by :func:`write_kegg_tables` or + :func:`stream_organism_gene_ko`. + + Compression is inferred from the suffix, so both the gzipped small tables + (``.tsv.gz``) and the xz-compressed ``organism_gene_ko.tsv.xz`` are read + transparently. 
def stream_organism_gene_ko(
    kegg_dir: str | Path, keep: set[str], ogk_path: str | Path, *, chunk_rows: int = 1_000_000
) -> pd.DataFrame:
    """Stream the ``ko`` file to a sorted, xz-compressed ``organism_gene_ko.tsv.xz``.

    Real KEGG has ~9M gene↔KO associations — far too many to hold in memory as a
    DataFrame. Rows are sorted by ``(organism, gene)`` before writing: gene IDs
    from one organism share long common prefixes (locus tags, numeric runs), so
    sorting makes them adjacent and lets the compressor shrink the table ~2.9x
    versus the unsorted gzip form. The order also matches the by-organism query
    pattern in :func:`get_kegg_model_for_organism`.

    The sort is an **external merge sort** bounded to ``chunk_rows`` rows in
    memory at a time (sorted runs spooled to gzipped temp files, then merged with
    :func:`heapq.merge`), so peak memory stays flat regardless of KEGG size. Only
    the small ``ko_names`` table (one row per KO) is held in full and returned.

    Only KO ids present in ``keep`` are written. The parent directory of
    ``ogk_path`` is created if needed, because the temporary run files are
    spooled next to the output file.
    """
    ogk_path = Path(ogk_path)
    # TemporaryDirectory(dir=...) requires an existing directory.
    ogk_path.parent.mkdir(parents=True, exist_ok=True)
    names: list[tuple[str, str]] = []
    buffer: list[str] = []
    runs: list[Path] = []

    with tempfile.TemporaryDirectory(prefix="ogk_sort_", dir=ogk_path.parent) as tmp:
        tmp_dir = Path(tmp)
        for entry in _iter_entries(Path(kegg_dir) / "ko"):
            ko_id = _first_id(entry.get("ENTRY", []))
            if not ko_id or ko_id not in keep:
                continue
            names.append((ko_id, entry["DEFINITION"][0].strip() if entry.get("DEFINITION") else ""))
            for organism, gene in _parse_gene_lines(entry.get("GENES", [])):
                buffer.append(f"{organism}\t{gene}\t{ko_id}\n")
                if len(buffer) >= chunk_rows:
                    runs.append(_flush_sorted_run(buffer, tmp_dir, len(runs)))
                    buffer = []
        if buffer:
            runs.append(_flush_sorted_run(buffer, tmp_dir, len(runs)))

        handles = []
        try:
            # Open one by one inside the try so that a failure on run k still
            # closes runs 0..k-1 (a list comprehension would leak them).
            for run_path in runs:
                handles.append(gzip.open(run_path, "rt", encoding="utf-8"))
            with lzma.open(ogk_path, "wt", encoding="utf-8", newline="") as out:
                out.write("organism\tgene\tko\n")
                out.writelines(heapq.merge(*handles, key=_ogk_sort_key))
        finally:
            # Close before the temporary directory is removed.
            for handle in handles:
                handle.close()
    return pd.DataFrame(names, columns=["ko", "name"])
+ """ + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + reactions = parse_kegg_reactions(kegg_dir) + compounds = parse_kegg_compounds(kegg_dir) + linked_kos = {ko for r in reactions for ko in r.kos} + + model = build_reference_model(reactions, compounds) + + small = { + "ko_reaction": pd.DataFrame( + [(ko, r.id) for r in reactions for ko in r.kos], columns=["ko", "reaction"] + ).drop_duplicates(ignore_index=True), + "rxn_flags": pd.DataFrame( + [(r.id, r.spontaneous, r.undefined_stoich, r.incomplete, r.general) for r in reactions], + columns=["reaction", "spontaneous", "undefined_stoich", "incomplete", "general"], + ), + } + paths = {name: p for name, p in zip(small, write_kegg_tables(small, out_dir), strict=True)} + + ogk_path = out_dir / "organism_gene_ko.tsv.xz" + ko_names = stream_organism_gene_ko(kegg_dir, linked_kos, ogk_path) + paths["organism_gene_ko"] = ogk_path + paths.update( + zip(["ko_names"], write_kegg_tables({"ko_names": ko_names}, out_dir), strict=True) + ) + + ref_path = out_dir / "reference_model.yml.gz" + write_yaml_model(model, ref_path) + paths["reference_model"] = ref_path + return paths diff --git a/src/raven_python/reconstruction/kegg/query.py b/src/raven_python/reconstruction/kegg/query.py new file mode 100644 index 0000000..2df3f78 --- /dev/null +++ b/src/raven_python/reconstruction/kegg/query.py @@ -0,0 +1,231 @@ +"""De-novo KEGG draft from a proteome FASTA via HMM search (step 3b.5). + +Ports the FASTA/HMM branch of RAVEN ``getKEGGModelForOrganism``: search a query +proteome against the KO profile-HMM library (3b.3), assign genes to KOs using the +score cut-off and the two score-ratio filters, then build the draft model with the +shared assembler. For organisms not in KEGG. + +Improvement over RAVEN: one ``hmmscan`` against the single ``hmmpress``-ed library +(K7) replaces RAVEN's per-KO ``hmmsearch`` loop. 
def run_hmmscan(
    fasta: str | Path,
    library: str | Path,
    *,
    threads: int = 1,
    hmmscan: str | Path | None = None,
) -> str:
    """Run ``hmmscan`` of ``fasta`` against the pressed ``library``; return tblout text.

    The executable is located with ``resolve_binary("hmmscan", binary=hmmscan)``.
    The hit table is written by ``--tblout`` into a temporary directory and read
    back before that directory is removed. hmmscan's stdout report is discarded
    rather than captured, because only the tblout file is used and the report can
    be very large for a whole proteome; stderr is kept for the error message.

    Raises:
        RuntimeError: if ``hmmscan`` exits with a non-zero status; the message
            carries the stripped stderr text.
    """
    exe = resolve_binary("hmmscan", binary=hmmscan)
    with tempfile.TemporaryDirectory() as tmp:
        tbl = Path(tmp) / "hits.tbl"
        cmd = [exe, "--cpu", str(threads), "--tblout", str(tbl), str(library), str(fasta)]
        proc = subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,  # the stdout report is never read
            stderr=subprocess.PIPE,
            text=True,
            check=False,  # the return code is inspected explicitly below
        )
        if proc.returncode != 0:
            raise RuntimeError(f"hmmscan failed:\n{(proc.stderr or '').strip()}")
        return tbl.read_text()
+ """ + rows = [] + for line in text.splitlines(): + if not line or line.startswith("#"): + continue + fields = line.split() + if len(fields) < 5: + continue + rows.append((fields[0], fields[2], float(fields[4]))) + return pd.DataFrame(rows, columns=["ko", "gene", "evalue"]) + + +def assign_kos( + hits: pd.DataFrame, + *, + cutoff: float = 1e-30, + min_score_ratio_ko: float = 0.3, + min_score_ratio_g: float = 0.9, +) -> dict[str, list[str]]: + """Assign genes to KOs from HMM hits, applying the cut-off and ratio filters. + + Ports RAVEN's three steps on the KO×gene E-value matrix: + + 1. keep hits with ``evalue <= cutoff``; + 2. **min_score_ratio_ko** — within a KO, drop genes whose + ``log(evalue)/log(best_evalue_in_KO) < min_score_ratio_ko`` (prune weak + members of a KO); + 3. **min_score_ratio_g** — within a gene, drop KOs whose + ``log(evalue)/log(best_evalue_for_gene) < min_score_ratio_g`` (stop a gene + that clearly belongs to one KO leaking into weaker ones). + + Smaller E-value = better; since all kept values are ``< 1`` their logs are + negative, so the best (smallest) hit gives ratio 1 and weaker hits give a + smaller positive ratio. + + Default calibration (see IMPROVEMENTS K15). Cross-validated against the true + KEGG gene→KO annotation of four organisms spanning the prok/euk libraries and + the well-/lesser-studied axis (*S. cerevisiae*, *Cyanidioschyzon merolae*, + *E. coli*, *Mycoplasma genitalium*): real annotations score + overwhelmingly (median E ≈ 1e-100…1e-155) while spurious hits pile up at + ≈1e-8, so the two are separated by ~20 orders of magnitude. RAVEN's + ``1e-50`` sits inside the *true* tail and silently drops real but divergent + hits — costing 16% gene→KO recall on the divergent minimal genome + (*M. genitalium*) for no noise-rejection benefit (noise is far weaker). 
The + default is therefore loosened to **1e-30** (recovers that tail; still ~22 + orders above the noise floor), with the precision work moved to + **min_score_ratio_g = 0.9** — the *effective* precision lever (it resolves + multi-KO genes). ``min_score_ratio_ko`` proved empirically inert across all + four organisms (identical output at 0.0/0.3/0.5) and is kept only for RAVEN + parity. + """ + # The ratio filters compare log(evalue)/log(best_evalue); when best == 1.0 + # the denominator is 0 → ZeroDivisionError. The default cutoff (1e-30) keeps + # us safely away, but a caller-passed cutoff ≥ 1 is ambiguous and would + # crash later. Reject it up front with a clear message. + if cutoff >= 1: + raise ValueError( + f"cutoff must be < 1 (smaller E-value = better hit); got {cutoff!r}." + ) + + # Best (smallest) E-value per (ko, gene), filtered at the cut-off. + mat: dict[str, dict[str, float]] = {} + for ko, gene, evalue in zip(hits["ko"], hits["gene"], hits["evalue"], strict=True): + if evalue > cutoff: + continue + e = evalue if evalue > 0 else _MIN_EVALUE + per_ko = mat.setdefault(ko, {}) + if gene not in per_ko or e < per_ko[gene]: + per_ko[gene] = e + + # Step 2: prune weak genes within each KO. + for ko, genes in mat.items(): + log_best = math.log(min(genes.values())) + mat[ko] = { + g: e for g, e in genes.items() if math.log(e) / log_best >= min_score_ratio_ko + } + + # Step 3: prune weak KOs within each gene (over the survivors of step 2). 
+ gene_kos: dict[str, dict[str, float]] = {} + for ko, genes in mat.items(): + for g, e in genes.items(): + gene_kos.setdefault(g, {})[ko] = e + dropped: set[tuple[str, str]] = set() + for g, kos in gene_kos.items(): + log_best = math.log(min(kos.values())) + for ko, e in kos.items(): + if math.log(e) / log_best < min_score_ratio_g: + dropped.add((ko, g)) + + result: dict[str, list[str]] = {} + for ko, genes in mat.items(): + kept = sorted(g for g in genes if (ko, g) not in dropped) + if kept: + result[ko] = kept + return result + + +def get_kegg_model_from_sequences( + fasta: str | Path, + reference_model: cobra.Model, + ko_reaction: pd.DataFrame, + library: str | Path, + *, + rxn_flags: pd.DataFrame | None = None, + model_id: str | None = None, + cutoff: float = 1e-30, + min_score_ratio_ko: float = 0.3, + min_score_ratio_g: float = 0.9, + keep_spontaneous: bool = True, + keep_undefined_stoich: bool = True, + keep_incomplete: bool = True, + keep_general: bool = False, + threads: int = 1, + hmmscan: str | Path | None = None, +) -> cobra.Model: + """Reconstruct a draft model for a proteome by HMM-searching the KO library. + + Searches ``fasta`` against the pressed ``library`` (3b.3), assigns KOs + (:func:`assign_kos`), and assembles the model against ``reference_model`` / + ``ko_reaction``. Genes are the query proteome's identifiers. 
+ """ + hits = parse_hmmscan_tblout(run_hmmscan(fasta, library, threads=threads, hmmscan=hmmscan)) + ko_to_genes = assign_kos( + hits, + cutoff=cutoff, + min_score_ratio_ko=min_score_ratio_ko, + min_score_ratio_g=min_score_ratio_g, + ) + model, _ = assemble_model_from_ko_genes( + reference_model, + ko_reaction, + ko_to_genes, + rxn_flags=rxn_flags, + keep_spontaneous=keep_spontaneous, + keep_undefined_stoich=keep_undefined_stoich, + keep_incomplete=keep_incomplete, + keep_general=keep_general, + model_id=model_id, + note=_NOTE, + ) + return model + + +def get_kegg_model_from_sequences_with_artefacts( + fasta: str | Path, + artefact_dir: str | Path | None = None, + library: str | Path | None = None, + *, + domain: str = "prokaryotes", + version: str | None = None, + **kwargs, +) -> cobra.Model: + """Load reference model + tables from ``artefact_dir`` and run the HMM query. + + If ``artefact_dir`` / ``library`` are ``None`` they are fetched/cached via + :func:`raven_python.data.ensure_kegg_data` / :func:`raven_python.data.ensure_kegg_hmm_library` + (``domain`` selects the prok/euk library; ``version`` the release). 
+ """ + if artefact_dir is None or library is None: + from raven_python.data import ensure_kegg_data, ensure_kegg_hmm_library + + if artefact_dir is None: + artefact_dir = ensure_kegg_data(version=version) + if library is None: + library = ensure_kegg_hmm_library(domain, version=version) + artefact_dir = Path(artefact_dir) + reference_model = read_yaml_model(artefact_dir / "reference_model.yml.gz") + ko_reaction = read_kegg_table(artefact_dir / "ko_reaction.tsv.gz") + rxn_flags = read_kegg_table(artefact_dir / "rxn_flags.tsv.gz") + return get_kegg_model_from_sequences( + fasta, reference_model, ko_reaction, library, rxn_flags=rxn_flags, **kwargs + ) diff --git a/src/raven_python/reconstruction/kegg/taxonomy.py b/src/raven_python/reconstruction/kegg/taxonomy.py new file mode 100644 index 0000000..463fcce --- /dev/null +++ b/src/raven_python/reconstruction/kegg/taxonomy.py @@ -0,0 +1,71 @@ +"""Parse the KEGG ``taxonomy`` file into per-organism category lineages. + +Ports the file-reading half of RAVEN ``getPhylDist`` (the distance-matrix half is +step 3b.5). The ``taxonomy`` file is an indented tree: ``#``-prefixed lines name a +category, the number of leading ``#`` giving its depth; organism lines are +tab-separated ``T-numberorg_idname...``. Each organism inherits the +stack of categories above it, the first of which is its domain (``Prokaryotes`` / +``Eukaryotes``). + +Used by 3b.3 to split genes into the prok/euk HMM libraries, and (later) by 3b.5 +for phylogenetic distances. 
+""" +from __future__ import annotations + +import warnings +from pathlib import Path + + +def parse_taxonomy(path: str | Path) -> dict[str, list[str]]: + """Return ``{organism_id: [category, ...]}`` from outermost to innermost.""" + org_categories: dict[str, list[str]] = {} + stack: list[str] = [] + skipped_level_warned = False + with open(path, encoding="utf-8") as handle: + for line_no, raw in enumerate(handle, start=1): + line = raw.rstrip("\n") + if not line.strip(): + continue + if line.startswith("#"): + depth = len(line) - len(line.lstrip("#")) + name = line[depth:].strip() + if depth - 1 > len(stack): + # Depth-skip (e.g. ## then ####): the original `stack[:depth-1]` + # silently produced a too-short lineage. Pad with explicit + # blanks so downstream slices stay aligned; warn once. + if not skipped_level_warned: + warnings.warn( + f"{path}: taxonomy depth skips a level near line {line_no} " + f"({'#' * depth} {name!r} appeared with stack {stack!r}); " + "padding the missing levels with '' (later occurrences silenced).", + stacklevel=2, + ) + skipped_level_warned = True + stack = stack + [""] * (depth - 1 - len(stack)) + else: + stack = stack[: depth - 1] + stack.append(name) + else: + fields = line.split("\t") if "\t" in line else line.split() + if len(fields) < 2: + continue + org_categories[fields[1].strip()] = list(stack) + return org_categories + + +def organism_domains(path: str | Path) -> dict[str, str]: + """Return ``{organism_id: domain}`` (the outermost category).""" + return {org: cats[0] for org, cats in parse_taxonomy(path).items() if cats} + + +def organisms_in_domain(path: str | Path, domain: str) -> set[str]: + """Organism ids whose outermost category matches ``domain`` (case-insensitive). + + Accepts a prefix, so ``"prok"`` matches ``"Prokaryotes"``. 
+ """ + needle = domain.lower() + return { + org + for org, dom in organism_domains(path).items() + if dom.lower().startswith(needle) or needle.startswith(dom.lower()) + } diff --git a/src/raven_python/tasks/__init__.py b/src/raven_python/tasks/__init__.py new file mode 100644 index 0000000..d232c16 --- /dev/null +++ b/src/raven_python/tasks/__init__.py @@ -0,0 +1,23 @@ +"""Metabolic task definition, parsing, and checking. + +* :class:`Task` + :func:`parse_task_list` — the task-list file format. +* :func:`check_tasks` + :class:`TaskResult` — run tasks against a model. +* :func:`find_task_essential_reactions` + :class:`EssentialReactionsResult` — reactions + a model must use to satisfy a task list (the input for (f)tINIT's task layer). +""" +from raven_python.tasks.check import ( + EssentialReactionsResult, + TaskResult, + check_tasks, + find_task_essential_reactions, +) +from raven_python.tasks.tasklist import Task, parse_task_list + +__all__ = [ + "EssentialReactionsResult", + "Task", + "TaskResult", + "check_tasks", + "find_task_essential_reactions", + "parse_task_list", +] diff --git a/src/raven_python/tasks/check.py b/src/raven_python/tasks/check.py new file mode 100644 index 0000000..817bae5 --- /dev/null +++ b/src/raven_python/tasks/check.py @@ -0,0 +1,332 @@ +"""Check whether a model performs a set of metabolic tasks. + +For each task the model is constrained by the task's allowed inputs/outputs (and any +extra reactions / bound changes), then tested for feasibility: a task *passes* if a +steady-state flux exists, unless it is marked ``should_fail`` (then it passes iff +infeasible). + +Inputs/outputs are encoded as ranges on the per-metabolite mass-balance constraint +(``model.constraints[met.id]``): an input allows net consumption (``Sv ∈ [-UB, -LB]``) +and an output allows / requires net production (``Sv ≤ UB``, and ``≥ LB`` if +``LB > 0``). 
Existing boundary reactions are closed first, so inputs/outputs are +defined solely by the task (closed-model semantics). +""" +from __future__ import annotations + +import pickle +from collections.abc import Iterable +from dataclasses import dataclass +from pathlib import Path + +import cobra +from cobra.exceptions import OptimizationError +from cobra.flux_analysis import flux_variability_analysis, pfba +from optlang.symbolics import Zero + +from raven_python.manipulation.add import add_reactions_from_equations +from raven_python.tasks.tasklist import Task, parse_task_list + +_ALLMETS = "ALLMETS" +_ALLMETSIN = "ALLMETSIN" + + +@dataclass +class TaskResult: + """Result of one task: ``passed`` is the verdict (accounts for ``should_fail``).""" + + id: str + description: str + passed: bool + feasible: bool + error: str | None = None + + +def _set_constraint_bounds(constraint, lb: float, ub: float) -> None: + """Set an optlang constraint's bounds without a transient lb > ub.""" + if lb > constraint.ub: + constraint.ub = ub + constraint.lb = lb + else: + constraint.lb = lb + constraint.ub = ub + + +def _classify(token: str) -> tuple[str, str | None]: + """Return ``("all", None)``, ``("comp", COMP)``, or ``("met", token_upper)``.""" + upper = token.upper() + if upper == _ALLMETS: + return "all", None + if upper.startswith(_ALLMETSIN + "[") and upper.endswith("]"): + return "comp", upper[len(_ALLMETSIN) + 1: -1] + return "met", upper # incl. malformed ALLMETSIN[... → treated as a (missing) metabolite + + +def _metabolite_bounds( + task: Task, name_to_ids: dict[str, list[str]], comp_to_ids: dict[str, list[str]] +) -> tuple[dict[str, list[float]], list[str]]: + """Compute ``{met_id: [lb, ub]}`` from a task's inputs/outputs (RAVEN ``b``). + + Bulk tokens (ALLMETS / ALLMETSIN) are applied before specific metabolites, as + RAVEN does. Returns the bounds and a list of unresolved tokens (→ task error). 
def task_name_maps(model: cobra.Model) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
    """Index a model's metabolites by ``NAME[COMP]`` and by compartment.

    Both lookups use upper-cased keys and map to *lists* of metabolite ids: a
    model can carry several metabolites with the same name and compartment, and
    a task referencing that name constrains all of them (as RAVEN does) rather
    than an arbitrary one. A missing compartment is filed under ``""`` in the
    compartment lookup.
    """
    by_label: dict[str, list[str]] = {}
    by_compartment: dict[str, list[str]] = {}
    for met in model.metabolites:
        label = f"{met.name}[{met.compartment}]".upper()
        compartment = (met.compartment or "").upper()
        for index, key in ((by_label, label), (by_compartment, compartment)):
            if key not in index:
                index[key] = []
            index[key].append(met.id)
    return by_label, by_compartment
def apply_task_constraints(
    model: cobra.Model, task: Task, name_to_id, comp_to_ids
) -> tuple[set[str], str | None]:
    """Apply a task's inputs/outputs/equations/bound-changes to ``model`` in place.

    Sets a feasibility (zero) objective. Returns ``(task_metabolite_ids, error)``;
    ``task_metabolite_ids`` are the model metabolites the task references (RAVEN's
    ``essentialMetsForTasks``). On error the model may be partially modified.
    """
    bounds, missing = _metabolite_bounds(task, name_to_id, comp_to_ids)
    if missing:
        return set(), f"unknown metabolite(s): {sorted(set(missing))}"
    task_mets = set(bounds)
    for mid, (lb, ub) in bounds.items():
        if (lb, ub) != (0.0, 0.0):
            _set_constraint_bounds(model.constraints[mid], lb, ub)

    if task.equations:
        # Snapshot first: metabolites created by the temporary reactions must
        # not be reported as task metabolites of the original model.
        existing = {m.id for m in model.metabolites}
        specs = [
            {"id": f"TASK_TMP_{i}", "equation": equ, "bounds": (lb, ub)}
            for i, (equ, lb, ub) in enumerate(task.equations)
        ]
        add_reactions_from_equations(model, specs, mets_by="name", allow_new_mets=True)
        for spec in specs:
            tmp = model.reactions.get_by_id(spec["id"])
            task_mets |= {m.id for m in tmp.metabolites if m.id in existing}

    for rxn_id, lb, ub in task.changed:
        if rxn_id not in model.reactions:
            return set(), f"CHANGED RXN not in model: {rxn_id!r}"
        model.reactions.get_by_id(rxn_id).bounds = (lb, ub)

    model.objective = model.problem.Objective(Zero, direction="max")  # feasibility only
    return task_mets, None


def _build_task_model(
    base: cobra.Model, task: Task, name_to_id, comp_to_ids
) -> tuple[cobra.Model | None, set[str], str | None]:
    """Copy ``base`` and apply a task's constraints (``model``/``error`` exclusive)."""
    model = base.copy()
    task_mets, error = apply_task_constraints(model, task, name_to_id, comp_to_ids)
    return (None if error else model), task_mets, error


def _run_task(base: cobra.Model, task: Task, name_to_id, comp_to_ids) -> TaskResult:
    """Test one task by applying its constraints to ``base`` in place, then reverting.

    Avoids copying the (genome-scale) model per task — the copy dominates ``check_tasks``
    runtime. ``with base:`` reverts everything ``apply_task_constraints`` does through
    cobra's API (temp reactions/metabolites for equations, reaction bounds, objective);
    the one untracked change — direct metabolite mass-balance (``model.constraints[mid]``)
    bound edits — is snapshotted and restored explicitly. Net result is identical to the
    copy-based version but reuses a single model across all tasks.
    """
    bounds, missing = _metabolite_bounds(task, name_to_id, comp_to_ids)
    if missing:
        return TaskResult(task.id, task.description, False, False,
                          f"unknown metabolite(s): {sorted(set(missing))}")
    saved = {mid: (base.constraints[mid].lb, base.constraints[mid].ub) for mid in bounds}
    try:
        with base:  # reverts temp reactions/mets, reaction bounds, objective on exit
            _, error = apply_task_constraints(base, task, name_to_id, comp_to_ids)
            if error is not None:
                return TaskResult(task.id, task.description, False, False, error)
            base.slim_optimize()
            feasible = base.solver.status == "optimal"
    finally:  # restore the untracked metabolite-constraint bound edits
        for mid, (lb, ub) in saved.items():
            _set_constraint_bounds(base.constraints[mid], lb, ub)
    return TaskResult(task.id, task.description, feasible != task.should_fail, feasible)


def check_tasks(
    model: cobra.Model,
    tasks: str | Iterable[Task],
    *,
    close_boundaries: bool = True,
) -> list[TaskResult]:
    """Run a task list against ``model`` and return a :class:`TaskResult` per task.

    ``tasks`` is a parsed list of :class:`Task` or a path to a task-list file. With
    ``close_boundaries`` (default), existing exchange/sink/demand reactions are
    closed so inputs/outputs are defined purely by the tasks (as RAVEN assumes).
    ``model`` itself is not modified; the checks run on a copy.
    """
    tasks = _as_tasks(tasks)
    base, name_to_id, comp_to_ids = _prepare_base(model, close_boundaries)
    return [_run_task(base, task, name_to_id, comp_to_ids) for task in tasks]


def _as_tasks(tasks: str | Iterable[Task]) -> list[Task]:
    """Return ``tasks`` as a list, parsing it first when it is a file path.

    ``str``, ``bytes`` and ``os.PathLike`` values are treated as paths; anything
    else is consumed as an iterable of tasks.
    """
    import os

    if isinstance(tasks, bytes):
        # ``pathlib.Path`` rejects bytes, so decode with the filesystem encoding
        # before handing the path to the parser.
        tasks = os.fsdecode(tasks)
    if isinstance(tasks, (str, os.PathLike)):
        return parse_task_list(tasks)
    return list(tasks)


def _prepare_base(model: cobra.Model, close_boundaries: bool):
    """Copy ``model``, optionally close its boundary reactions, and build the name maps."""
    base = model.copy()
    if close_boundaries:
        for rxn in base.boundary:
            rxn.bounds = (0.0, 0.0)
    name_to_id, comp_to_ids = task_name_maps(base)
    return base, name_to_id, comp_to_ids


@dataclass
class EssentialReactionsResult:
    """Reactions a model *must* use to perform a task list (RAVEN ``essentialRxns``).

    ``reactions`` maps reaction id → forced flux direction (``+1`` forward, ``-1``
    reverse): the reaction must carry flux of that sign in every feasible solution of
    at least one task. ``per_task`` is the same, split by task id. ``task_metabolites``
    are the model metabolites the tasks reference (RAVEN ``essentialMetsForTasks``,
    protected from removal). ``failed_tasks`` are tasks that were infeasible or
    malformed and thus skipped (RAVEN drops these from the task list).
    """

    reactions: dict[str, int]
    per_task: dict[str, dict[str, int]]
    task_metabolites: set[str]
    failed_tasks: list[str]
def _task_essential_reactions(
    task_model: cobra.Model, candidates: list[str], tol: float
) -> dict[str, int]:
    """Reactions in ``candidates`` forced to carry flux, with direction, via FVA.

    A reaction is *essential* for the task iff zero is not attainable in any feasible
    solution — i.e. its FVA range excludes 0. This is exactly RAVEN's
    "constrain to 0 → infeasible" definition, but obtained from FVA ranges (no
    per-reaction knockout loop). The nonzero side of the range gives the forced
    direction. FVA is restricted to ``candidates`` — the reactions carrying flux in a
    minimal feasible solution, the only ones that *can* be essential (an essential
    reaction is nonzero in every feasible solution, so also in that one) — which keeps
    this cheap on genome-scale templates instead of ranging all reactions.
    """
    if not candidates:
        return {}
    fva = flux_variability_analysis(task_model, reaction_list=candidates, fraction_of_optimum=0.0)
    essential: dict[str, int] = {}
    for rxn_id, lo, hi in zip(fva.index, fva["minimum"], fva["maximum"], strict=True):
        if lo > tol:
            essential[rxn_id] = 1
        elif hi < -tol:
            essential[rxn_id] = -1
    return essential


def find_task_essential_reactions(
    model: cobra.Model,
    tasks: str | Iterable[Task],
    *,
    close_boundaries: bool = True,
    tol: float = 1e-8,
    cache_path: str | Path | None = None,
) -> EssentialReactionsResult:
    """Find the reactions a model must use to satisfy a task list.

    For each task the model is constrained as in :func:`check_tasks`, then FVA
    identifies reactions whose flux can never be zero (essential) and their forced
    direction. This is the ``prepINITModel`` step that feeds (ft)INIT: essential
    reactions are kept regardless of expression score and made irreversible in their
    forced direction. When a reaction is essential in several tasks with conflicting
    directions, the majority wins (ties → forward), matching RAVEN's ``pos < neg``.

    On a genome-scale model this is slow (an FVA per task). Pass ``cache_path`` to make
    it **resumable**: each task's result is written there as it completes (to a
    ``.part`` file that is closed and then renamed over the cache), and a re-run
    skips tasks already cached — so it survives interruptions and finishes across
    several sessions.

    The cache is a pickle. Unpickling can execute arbitrary code, so only point
    ``cache_path`` at a file written by this function.
    """
    tasks = _as_tasks(tasks)
    base, name_to_id, comp_to_ids = _prepare_base(model, close_boundaries)
    original_ids = {r.id for r in base.reactions}

    per_task: dict[str, dict[str, int]] = {}
    task_metabolites: set[str] = set()
    failed: list[str] = []
    if cache_path is not None and Path(cache_path).exists():
        # NOTE(security): pickle is only safe for caches this function wrote.
        with open(cache_path, "rb") as handle:
            cached = pickle.load(handle)
        per_task = cached["per_task"]
        task_metabolites = set(cached["mets"])
        failed = list(cached["failed"])

    done = set(per_task) | set(failed)
    for task in tasks:
        if task.should_fail or task.id in done:
            continue  # a should-fail task defines no essentials; cached ones are skipped
        task_model, task_mets, error = _build_task_model(base, task, name_to_id, comp_to_ids)
        if error is not None:
            failed.append(task.id)
        else:
            # One min-flux solve both proves feasibility and yields the essential-reaction
            # candidates (the original reactions carrying flux in a sparse solution).
            try:
                fluxes = pfba(task_model).fluxes
                candidates = [rid for rid in original_ids if abs(fluxes.get(rid, 0.0)) > tol]
                task_metabolites |= task_mets
                per_task[task.id] = _task_essential_reactions(task_model, candidates, tol)
            except OptimizationError:
                failed.append(task.id)
        if cache_path is not None:  # checkpoint after each task
            tmp = Path(f"{cache_path}.part")
            # Close (and thereby flush) the temp file *before* renaming it, so the
            # cache never points at a half-written pickle and the rename also
            # works on platforms that refuse to replace an open file.
            with open(tmp, "wb") as handle:
                pickle.dump(
                    {"per_task": per_task, "mets": task_metabolites, "failed": failed},
                    handle,
                )
            tmp.replace(cache_path)

    # Majority direction; tie (sum == 0) → forward, as RAVEN's `pos < neg`.
    direction_votes: dict[str, int] = {}
    for essential in per_task.values():
        for rxn_id, direction in essential.items():
            direction_votes[rxn_id] = direction_votes.get(rxn_id, 0) + direction
    reactions = {rid: (-1 if votes < 0 else 1) for rid, votes in direction_votes.items()}
    return EssentialReactionsResult(reactions, per_task, task_metabolites, failed)
_COLUMNS = (
    "ID", "DESCRIPTION", "IN", "IN LB", "IN UB", "OUT", "OUT LB", "OUT UB",
    "EQU", "EQU LB", "EQU UB", "CHANGED RXN", "CHANGED LB", "CHANGED UB",
    "SHOULD FAIL", "PRINT FLUX", "COMMENTS",
)


@dataclass
class Task:
    """One metabolic task. Bounds are ``(metabolite_or_reaction, lb, ub)`` triples."""

    id: str
    description: str = ""
    should_fail: bool = False
    print_fluxes: bool = False
    comments: str = ""
    inputs: list[tuple[str, float, float]] = field(default_factory=list)
    outputs: list[tuple[str, float, float]] = field(default_factory=list)
    equations: list[tuple[str, float, float]] = field(default_factory=list)
    changed: list[tuple[str, float, float]] = field(default_factory=list)


def _truthy(value: str) -> bool:
    """Interpret a flag cell: blank, ``0``, ``false`` and ``no`` are false, anything else true."""
    return value.strip().lower() not in ("", "0", "false", "no")


def _num(value: str, default: float) -> float:
    """Parse a numeric cell, returning ``default`` when the cell is blank."""
    value = value.strip()
    return float(value) if value else default


def _read_rows(path: str | Path) -> list[list[str]]:
    """Read a task-list file into rows of strings.

    ``.xlsx``/``.xlsm`` files are read from their ``TASKS`` sheet via openpyxl
    (empty cells become ``""``); everything else is read as tab-delimited text.

    Raises
    ------
    ImportError
        For an Excel file when openpyxl is not installed.
    ValueError
        For an Excel file without a ``TASKS`` sheet.
    """
    path = Path(path)
    if path.suffix.lower() in (".xlsx", ".xlsm"):
        try:
            from openpyxl import load_workbook
        except ImportError as exc:  # pragma: no cover - optional dep
            raise ImportError("Reading .xlsx task lists needs the '[excel]' extra (openpyxl).") from exc
        wb = load_workbook(path, data_only=True)
        if "TASKS" not in wb.sheetnames:
            raise ValueError(
                f"{path}: workbook has no sheet named 'TASKS' "
                f"(found: {wb.sheetnames}). Rename the sheet or pick that file."
            )
        ws = wb["TASKS"]
        return [["" if c is None else str(c) for c in row] for row in ws.iter_rows(values_only=True)]
    # ``utf-8-sig`` drops a leading byte-order mark (written by e.g. Excel's
    # "UTF-8" text export). With plain ``utf-8`` the first header cell would be
    # "\ufeffID" and the ``ID`` column would never be recognised. Files without
    # a BOM are read unchanged.
    with open(path, encoding="utf-8-sig", newline="") as handle:
        return list(csv.reader(handle, delimiter="\t"))


def parse_task_list(path: str | Path) -> list[Task]:
    """Parse a task-list file into :class:`Task` objects.

    The header is the first row containing an ``ID`` cell; rows before it are
    ignored. A row with an ID starts a new task, an ID-less row continues the
    current one, and rows whose ID starts with ``#`` are skipped as comments.

    Raises
    ------
    ValueError
        If no header row with an ``ID`` column is found.
    """
    rows = _read_rows(path)
    header_idx = next(
        (i for i, r in enumerate(rows) if any(c.strip().upper() == "ID" for c in r)), None
    )
    if header_idx is None:
        raise ValueError(f"{path}: no header row with an 'ID' column found.")
    header = [c.strip().upper() for c in rows[header_idx]]
    col = {name: header.index(name) for name in _COLUMNS if name in header}

    def cell(row: list[str], name: str) -> str:
        # Missing columns and short rows both read as an empty cell.
        i = col.get(name)
        return row[i].strip() if i is not None and i < len(row) else ""

    # Columns whose presence on an ID-less row signals real continuation data
    # (vs. pure whitespace/comment), used by the orphan-row warning below.
    _DATA_COLS = ("IN", "OUT", "EQU", "CHANGED RXN")

    tasks: list[Task] = []
    current: Task | None = None
    for row_no, row in enumerate(rows[header_idx + 1:], start=header_idx + 2):
        if not any(c.strip() for c in row):
            continue
        rid = cell(row, "ID")
        if rid.startswith("#"):
            continue
        if rid:
            current = Task(
                id=rid,
                description=cell(row, "DESCRIPTION"),
                should_fail=_truthy(cell(row, "SHOULD FAIL")),
                print_fluxes=_truthy(cell(row, "PRINT FLUX")),
                comments=cell(row, "COMMENTS"),
            )
            tasks.append(current)
        if current is None:
            # Continuation row appearing before any task ID: silently dropping it
            # used to mask malformed task files. Warn (and skip) so the user sees it.
            if any(cell(row, c) for c in _DATA_COLS):
                warnings.warn(
                    f"{path}: row {row_no} carries task data but no task ID has "
                    "been seen yet; the row is being skipped.",
                    stacklevel=2,
                )
            continue
        _add_row(current, row, cell)
    return tasks


def _add_row(task: Task, row: list[str], cell) -> None:
    """Append one row's inputs, outputs, equation and bound changes to ``task``.

    ``cell(row, column_name)`` must return the stripped cell text (``""`` when
    absent). Several ``;``-separated entries in one cell share that row's bounds.
    """
    if inp := cell(row, "IN"):
        lb, ub = _num(cell(row, "IN LB"), 0.0), _num(cell(row, "IN UB"), 1000.0)
        task.inputs += [(m.strip(), lb, ub) for m in inp.split(";") if m.strip()]
    if out := cell(row, "OUT"):
        lb, ub = _num(cell(row, "OUT LB"), 0.0), _num(cell(row, "OUT UB"), 1000.0)
        task.outputs += [(m.strip(), lb, ub) for m in out.split(";") if m.strip()]
    if equ := cell(row, "EQU"):
        # A reversible equation ("<=>") defaults to a negative lower bound.
        lb = _num(cell(row, "EQU LB"), -1000.0 if "<=>" in equ else 0.0)
        ub = _num(cell(row, "EQU UB"), 1000.0)
        task.equations.append((equ.strip(), lb, ub))
    if chg := cell(row, "CHANGED RXN"):
        lb, ub = _num(cell(row, "CHANGED LB"), -1000.0), _num(cell(row, "CHANGED UB"), 1000.0)
        task.changed += [(r.strip(), lb, ub) for r in chg.split(";") if r.strip()]
@dataclass(frozen=True)
class ElementalBalance:
    """Balance result for one reaction.

    Attributes
    ----------
    reaction_id
        ID of the reaction.
    status
        ``"balanced"`` — elements balance;
        ``"unbalanced"`` — they do not (see ``imbalance``);
        ``"unknown"`` — the reaction has no metabolites, or at least one
        metabolite has no formula, so balance cannot be determined.
    imbalance
        Element → net coefficient as reported by ``check_mass_balance()``,
        only for ``"unbalanced"``; empty otherwise. Charge is not included.
    """

    reaction_id: str
    status: str
    imbalance: dict[str, float] = field(default_factory=dict)


def get_elemental_balance(
    model: cobra.Model, reactions=None
) -> list[ElementalBalance]:
    """Check whether reactions are elementally balanced.

    Parameters
    ----------
    model
        Model whose reactions are checked (and in which IDs are looked up).
    reactions
        A reaction ID/object, or an iterable of them; default all reactions.
        (Boundary reactions exchange mass with the environment and will read
        as ``unbalanced`` — filter them out if that is not wanted.)

    Returns
    -------
    list of ElementalBalance
        One entry per checked reaction, in the order the reactions were given
        (model order when ``reactions`` is omitted).
    """

    def grade(rxn) -> ElementalBalance:
        # No stoichiometry, or any formula missing: balance is undeterminable,
        # so report "unknown" rather than trusting the mass-balance call.
        if not rxn.metabolites or any(not met.formula for met in rxn.metabolites):
            return ElementalBalance(rxn.id, "unknown")
        off_balance = {
            element: amount
            for element, amount in rxn.check_mass_balance().items()
            if element != "charge"
        }
        if off_balance:
            return ElementalBalance(rxn.id, "unbalanced", off_balance)
        return ElementalBalance(rxn.id, "balanced")

    if reactions is None:
        selected = list(model.reactions)
    else:
        # A lone ID or Reaction is accepted as shorthand for a one-item list.
        if isinstance(reactions, (str, cobra.Reaction)):
            reactions = [reactions]
        selected = [
            item if isinstance(item, cobra.Reaction) else model.reactions.get_by_id(item)
            for item in reactions
        ]
    return [grade(rxn) for rxn in selected]
def _contains_or(node: ast.AST | None) -> bool:
    """True if ``node``'s boolean-operator subtree contains an OR anywhere."""
    if not isinstance(node, ast.BoolOp):
        return False
    if isinstance(node.op, ast.Or):
        return True
    for operand in node.values:
        if _contains_or(operand):
            return True
    return False


def _is_dnf_node(node: ast.AST | None) -> bool:
    """True if the AST rooted at ``node`` is in disjunctive normal form.

    DNF here means no AND operator has an OR anywhere beneath it, i.e. the
    rule is a single gene, a pure AND-complex, or an OR of those.
    """
    if not isinstance(node, ast.BoolOp):
        # ``None``, a single gene name, or an unrecognised node type: nothing
        # to flag.
        return True
    if isinstance(node.op, ast.And):
        for operand in node.values:
            if _contains_or(operand):
                return False
        return True
    # OR: every disjunct must itself be DNF.
    for operand in node.values:
        if not _is_dnf_node(operand):
            return False
    return True


def is_dnf(gpr: GPR | str | None) -> bool:
    """Return whether a GPR is in disjunctive normal form (OR of AND-complexes).

    Parameters
    ----------
    gpr
        A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``.
        An empty/``None`` rule is trivially DNF.

    Examples
    --------
    >>> is_dnf("(G1 and G2) or G3")
    True
    >>> is_dnf("(G1 or G2) and G3")
    False
    """
    rule = GPR.from_string(gpr) if isinstance(gpr, str) else gpr
    return True if rule is None else _is_dnf_node(rule.body)


@dataclass(frozen=True)
class GPRIssue:
    """A reaction whose GPR is flagged by the linter.

    Attributes
    ----------
    reaction_id
        ID of the reaction.
    gpr
        The reaction's grRule string.
    reason
        Human-readable explanation of why it was flagged.
    """

    reaction_id: str
    gpr: str
    reason: str


_NON_DNF_REASON = (
    "GPR is not in disjunctive normal form (an AND clause contains an OR). "
    "Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, "
    'e.g. rewrite "(G1 or G2) and (G3 or G4)" as '
    '"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".'
)


def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]:
    """Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes").

    Each reaction's ``gpr`` is checked with :func:`is_dnf`. Reactions with an
    empty ``gene_reaction_rule`` are skipped.

    Returns
    -------
    list of GPRIssue
        One entry per flagged reaction, in model reaction order. Empty if all
        GPRs are simple OR-of-AND-complexes.
    """
    return [
        GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON)
        for rxn in model.reactions
        if rxn.gene_reaction_rule and not is_dnf(rxn.gpr)
    ]
# ``name[comp]``: the name part is greedy, so for a name that itself contains
# brackets the *last* ``[...]`` is taken as the compartment. The named groups
# ``name`` and ``comp`` are read by ``parse_name_comp`` below.
_NAME_COMP_RE = re.compile(r"^(?P<name>.+)\[(?P<comp>[^\[\]]+)\]$")


def parse_name_comp(token: str) -> tuple[str, str | None]:
    """Split a ``name[comp]`` token into ``(name, compartment)``.

    This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``'
    ``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written
    as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``.

    Surrounding whitespace is stripped from the token and from both parts.
    Returns ``(name, None)`` when there is no trailing ``[...]`` (or nothing
    precedes it).

    Examples
    --------
    >>> parse_name_comp("ATP[c]")
    ('ATP', 'c')
    >>> parse_name_comp("ATP")
    ('ATP', None)
    >>> parse_name_comp("weird[name][m]")
    ('weird[name]', 'm')
    """
    match = _NAME_COMP_RE.match(token.strip())
    if match:
        return match.group("name").strip(), match.group("comp").strip()
    return token.strip(), None


def sort_identifiers(model: cobra.Model) -> cobra.Model:
    """Sort reactions, metabolites and genes alphabetically by ID, in place.

    Returns the same (mutated) model for convenience. Compartments are a plain
    dict and are emitted sorted by writers as needed.
    """
    model.reactions.sort(key=lambda r: r.id)
    model.metabolites.sort(key=lambda m: m.id)
    model.genes.sort(key=lambda g: g.id)
    return model
@dataclass(frozen=True)
class ModelIssue:
    """One curation issue found in a model.

    Attributes
    ----------
    category
        Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``,
        ``"orphan_gene"``, ``"duplicate_name_compartment"``,
        ``"empty_metabolite_name"``, ``"objective"``.
    object_id
        ID of the offending object, or ``None`` for model-level issues.
    message
        Human-readable description.
    """

    category: str
    object_id: str | None
    message: str


def check_model(model: cobra.Model) -> list[ModelIssue]:
    """Run curation checks on a model and return the issues found.

    Checks, in reporting order: metabolites used by no reaction or lacking a
    name, genes attached to no reaction, reactions without metabolites,
    metabolites sharing a name within one compartment, and an objective that
    has no — or more than one — reaction with a nonzero coefficient.

    Does not raise; returns a (possibly empty) list of :class:`ModelIssue`.
    """
    found: list[ModelIssue] = []

    def report(category: str, object_id: str | None, message: str) -> None:
        found.append(ModelIssue(category, object_id, message))

    for met in model.metabolites:
        if not met.reactions:
            report("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.")
        if not (met.name and str(met.name).strip()):
            report("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.")

    for gene in model.genes:
        if not gene.reactions:
            report("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.")

    for rxn in model.reactions:
        if not rxn.metabolites:
            report("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.")

    # Group metabolite ids by exact (name, compartment); unnamed metabolites
    # are excluded from the duplicate report below.
    groups: dict[tuple[str, str], list[str]] = {}
    for met in model.metabolites:
        groups.setdefault((met.name, met.compartment), []).append(met.id)
    for (name, comp), ids in groups.items():
        if name and len(ids) > 1:
            report(
                "duplicate_name_compartment",
                None,
                f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}",
            )

    objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0]
    if not objective_rxns:
        report("objective", None, "No reaction has a nonzero objective coefficient.")
    elif len(objective_rxns) > 1:
        report("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}")

    return found
Grape sugar +FORMULA C6H12O6 +DBLINKS ChEBI: 4167 17634 +/// +ENTRY C01083 Compound +NAME alpha,alpha-Trehalose +FORMULA C12H22O11 +/// +ENTRY C00007 Compound +NAME Oxygen +FORMULA O2 +/// diff --git a/tests/data/kegg_dump/compound.inchi b/tests/data/kegg_dump/compound.inchi new file mode 100644 index 0000000..448312f --- /dev/null +++ b/tests/data/kegg_dump/compound.inchi @@ -0,0 +1 @@ +C00031 InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2 diff --git a/tests/data/kegg_dump/genes.pep b/tests/data/kegg_dump/genes.pep new file mode 100644 index 0000000..f30073d --- /dev/null +++ b/tests/data/kegg_dump/genes.pep @@ -0,0 +1,12 @@ +>bsu:BSU31050 gbsB; choline dehydrogenase +MKVLAAGGTGYIGSHTVVELLEAGYDVVVLDNLSNGHREAVPKGVPFveqIDLRDREALDR +>bsu:BSU31060 hypothetical protein +MKVLAAGGTGYIGSHTVVELLEAGYDVVVLDNLSNGHREAVPKGVPFveqIDLRDREALDX +>eco:b0001 thrA; aspartokinase +MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDA +>hsa:124 ADH1A; alcohol dehydrogenase 1A +MSTAGKVIKCKAAVLWELKKPFSIEEVEVAPPKAHEVRIKMVATGICRSDDHVVSGTLVT +>hsa:125 ADH1B; alcohol dehydrogenase 1B +MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAHEVRIKMVATGICRSDDHVVSGTLVT +>xxx:unused some other gene not in any KO +MAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA diff --git a/tests/data/kegg_dump/ko b/tests/data/kegg_dump/ko new file mode 100644 index 0000000..f6ae027 --- /dev/null +++ b/tests/data/kegg_dump/ko @@ -0,0 +1,14 @@ +ENTRY K01194 KO +NAME treA, TREH +DEFINITION alpha,alpha-trehalase [EC:3.2.1.28] +GENES BSU: BSU31050(gbsB) BSU31060 + HSA: 124 125(ADH) +/// +ENTRY K00002 KO +DEFINITION AKR1A1; alcohol dehydrogenase (NADP+) [EC:1.1.1.2] +GENES ECO: b0001 +/// +ENTRY K99999 KO +DEFINITION unlinked ortholog +GENES ECO: b9999 +/// diff --git a/tests/data/kegg_dump/reaction b/tests/data/kegg_dump/reaction new file mode 100644 index 0000000..b0e75c5 --- /dev/null +++ b/tests/data/kegg_dump/reaction @@ -0,0 +1,31 @@ +ENTRY R00010 Reaction +NAME alpha,alpha-trehalose glucohydrolase 
+DEFINITION alpha,alpha-Trehalose + H2O <=> 2 D-Glucose +EQUATION C01083 + C00001 <=> 2 C00031 +ENZYME 3.2.1.28 +PATHWAY rn00500 Starch and sucrose metabolism + rn01100 Metabolic pathways +MODULE M00599 example module +ORTHOLOGY K01194 alpha,alpha-trehalase [EC:3.2.1.28] +DBLINKS RHEA: 32678 +/// +ENTRY R00100 Reaction +NAME spontaneous example +COMMENT This reaction is spontaneous. +EQUATION C00002 <=> C00003 +ORTHOLOGY K00002 some enzyme +/// +ENTRY R00200 Reaction +NAME undefined stoich example +EQUATION C00001 + n C00002 <=> C00003 +/// +ENTRY R00300 Reaction +NAME general example +COMMENT General reaction. +EQUATION C00031 <=> C00006 +ORTHOLOGY K09999 lumped ortholog +/// +ENTRY R00400 Reaction +NAME empty after cancellation +EQUATION C00007 <=> C00007 +/// diff --git a/tests/data/kegg_dump/reaction_mapformula.lst b/tests/data/kegg_dump/reaction_mapformula.lst new file mode 100644 index 0000000..0adb8f0 --- /dev/null +++ b/tests/data/kegg_dump/reaction_mapformula.lst @@ -0,0 +1,3 @@ +R00010: 00500: C01083 => C00031 +R00010: 00010: C00031 => C01083 +R00100: 00010: C00002 => C00003 diff --git a/tests/data/kegg_dump/taxonomy b/tests/data/kegg_dump/taxonomy new file mode 100644 index 0000000..f0447e6 --- /dev/null +++ b/tests/data/kegg_dump/taxonomy @@ -0,0 +1,10 @@ +# Prokaryotes +## Bacteria +### Firmicutes +T00010 bsu Bacillus subtilis 168 Bacillus +### Gammaproteobacteria - Enterobacteria +T00007 eco Escherichia coli K-12 MG1655 Escherichia +# Eukaryotes +## Animals +### Vertebrates - Mammals +T01001 hsa Homo sapiens (human) Homo diff --git a/tests/test_analysis_fseof.py b/tests/test_analysis_fseof.py new file mode 100644 index 0000000..5f23f3f --- /dev/null +++ b/tests/test_analysis_fseof.py @@ -0,0 +1,112 @@ +"""Tests for FSEOF (analysis/fseof.py, Phase 5).""" +import cobra +import pytest + +from raven_python.analysis import FSEOFResult, fseof + + +@pytest.fixture +def model(): + """S -> I, then I branches to product P (via v2) or biomass B (via v3). 
+ + Enforcing product export (EX_P) should amplify the product branch (v1, v2) and + suppress the biomass branch (v3), which competes for the shared intermediate I. + """ + m = cobra.Model("cell") + S, inter, P, B = (cobra.Metabolite(x, compartment="c") for x in ("S", "I", "P", "B")) + m.add_metabolites([S, inter, P, B]) + sup = cobra.Reaction("sup", lower_bound=0, upper_bound=10) # -> S (substrate supply) + sup.add_metabolites({S: 1}) + v1 = cobra.Reaction("v1", lower_bound=0, upper_bound=1000) + v1.add_metabolites({S: -1, inter: 1}) + v2 = cobra.Reaction("v2", lower_bound=0, upper_bound=1000) + v2.add_metabolites({inter: -1, P: 1}) + v3 = cobra.Reaction("v3", lower_bound=0, upper_bound=1000) + v3.add_metabolites({inter: -1, B: 1}) + ex_p = cobra.Reaction("EX_P", lower_bound=0, upper_bound=1000) # target product export + ex_p.add_metabolites({P: -1}) + ex_b = cobra.Reaction("EX_B", lower_bound=0, upper_bound=1000) # biomass + ex_b.add_metabolites({B: -1}) + m.add_reactions([sup, v1, v2, v3, ex_p, ex_b]) + v1.gene_reaction_rule = "gA" + v2.gene_reaction_rule = "gB" + v3.gene_reaction_rule = "gC" + m.objective = "EX_B" + return m + + +def test_returns_result_with_scan(model): + res = fseof(model, "EX_P", n_steps=8) + assert isinstance(res, FSEOFResult) + assert res.scan.shape[1] == len(res.enforced) >= 2 + assert "v2" in res.scan.index # full scan retained, indexed by reaction + + +def test_amplification_targets(model): + res = fseof(model, "EX_P", n_steps=8) + amp = set(res.amplification["reaction"]) + # the product-forming reaction is amplified as EX_P is enforced upward + # (v1/sup run at capacity throughout, so they stay constant and aren't flagged). 
+ assert {"v2", "EX_P"} <= amp + v2 = res.targets.set_index("reaction").loc["v2"] + assert v2["slope"] > 0 and v2["correlation"] > 0.9 + + +def test_knockdown_of_competing_branch(model): + res = fseof(model, "EX_P", n_steps=8) + # v3 (biomass branch) competes for I -> suppressed toward zero -> knockdown/knockout + down = set(res.knockout["reaction"]) + assert "v3" in down + v3 = res.targets.set_index("reaction").loc["v3"] + assert v3["slope"] < 0 + assert v3["target_type"] in ("knockdown", "knockout") + + +def test_gene_targets_aggregation(model): + res = fseof(model, "EX_P", n_steps=8) + genes = set(res.gene_targets["gene"]) + assert {"gA", "gB", "gC"} & genes # reaction targets mapped to their genes + gB = res.gene_targets.set_index("gene").loc["gB"] + assert "v2" in gB["reactions"] + + +def test_unproducible_target_raises(model): + # A reaction that cannot carry positive flux is not a valid product target. + dead = cobra.Reaction("dead", lower_bound=0, upper_bound=0) + dead.add_metabolites({model.metabolites.P: -1}) + model.add_reactions([dead]) + with pytest.raises(ValueError, match="cannot carry positive flux"): + fseof(model, "dead") + + +def test_infeasible_model_raises_clear_error(model): + """An infeasible model (slim_optimize -> NaN) raises the clear guard, not a NaN scan.""" + model.reactions.sup.bounds = (5, 5) # force uptake while EX_P demands more -> infeasible + model.reactions.EX_P.bounds = (1000, 1000) + with pytest.raises(ValueError, match="cannot carry positive flux"): + fseof(model, "EX_P", n_steps=4) + + +# --- regression: slope-based labels (known_issues.md F3) ------------------- + +def test_amplify_label_uses_abs_slope_not_endpoint_difference(): + """A reaction whose |flux| trend is upward but whose final value happens + to equal the initial (endpoints straddle a peak) should be labelled + ``amplify`` by the regression-slope rule, not ``knockdown`` by the old + endpoint check.""" + import numpy as np + import pandas as pd + + from 
raven_python.analysis.fseof import _classify + + # Endpoints equal (0), but the |flux| regression slope is clearly positive + # over the scan — the new classifier picks amplify; the old endpoint code + # would have said knockdown (final not below eps, abs(final) not > abs(initial)). + enforced = np.linspace(0.0, 1.0, 6) + flux = np.array([0.0, 0.3, 0.6, 0.9, 0.4, 0.0]) + scan = pd.DataFrame([flux], index=["r_test"], columns=enforced) + m = cobra.Model("synth") + m.add_reactions([cobra.Reaction("r_test")]) + table = _classify(m, scan, enforced, corr_threshold=0.0, flux_eps=1e-6) + assert not table.empty + assert table.iloc[0]["target_type"] == "amplify" diff --git a/tests/test_analysis_reporter.py b/tests/test_analysis_reporter.py new file mode 100644 index 0000000..918f15d --- /dev/null +++ b/tests/test_analysis_reporter.py @@ -0,0 +1,89 @@ +"""Tests for Reporter Metabolites (analysis/reporter.py, Phase 5).""" +import cobra +import pytest + +from raven_python.analysis import ReporterResult, reporter_metabolites + + +def _met(mid): + return cobra.Metabolite(mid, name=mid[:-2], compartment="c") + + +@pytest.fixture +def model(): + """A-r1(g1)-B-r2(g2)-C-r3(g3); rX touches X but has no gene.""" + m = cobra.Model("rep") + A, B, C, X = _met("A_c"), _met("B_c"), _met("C_c"), _met("X_c") + m.add_metabolites([A, B, C, X]) + r1 = cobra.Reaction("r1") + r1.add_metabolites({A: -1, B: 1}) + r2 = cobra.Reaction("r2") + r2.add_metabolites({B: -1, C: 1}) + r3 = cobra.Reaction("r3") + r3.add_metabolites({C: -1}) + rX = cobra.Reaction("rX") + rX.add_metabolites({X: -1}) + m.add_reactions([r1, r2, r3, rX]) + r1.gene_reaction_rule = "g1" + r2.gene_reaction_rule = "g2" + r3.gene_reaction_rule = "g3" + return m + + +def test_ranks_metabolites_by_surrounding_significance(model): + # g1, g2 highly significant; g3 not. B (g1,g2) > A (g1) > C (g2,g3). 
+ (res,) = reporter_metabolites(model, {"g1": 0.001, "g2": 0.001, "g3": 0.5}) + assert isinstance(res, ReporterResult) and res.test == "all" + assert list(res.table["metabolite"]) == ["B_c", "A_c", "C_c"] + assert res.table["z_score"].is_monotonic_decreasing + assert "X_c" not in set(res.table["metabolite"]) # no neighbouring genes -> excluded + + +def test_neighbour_counts(model): + (res,) = reporter_metabolites(model, {"g1": 0.01, "g2": 0.01, "g3": 0.01}) + counts = dict(zip(res.table["metabolite"], res.table["n_genes"], strict=True)) + assert counts == {"A_c": 1, "B_c": 2, "C_c": 2} + + +def test_uniform_pvalues_give_zero_scores(model): + # All genes identical -> background std 0 -> nothing stands out (corrected z = 0). + (res,) = reporter_metabolites(model, {"g1": 0.2, "g2": 0.2, "g3": 0.2}) + assert (res.table["z_score"] == 0.0).all() + assert res.table["p_value"].to_numpy() == pytest.approx(0.5) + + +def test_p_value_low_for_top_metabolite(model): + (res,) = reporter_metabolites(model, {"g1": 1e-6, "g2": 1e-6, "g3": 0.9}) + top = res.table.iloc[0] + assert top["metabolite"] == "B_c" + assert top["p_value"] < 0.5 # enriched -> significant + + +def test_fold_change_splits_up_down(model): + res = reporter_metabolites( + model, + {"g1": 0.001, "g2": 0.001, "g3": 0.001}, + gene_fold_changes={"g1": 2.0, "g2": -2.0, "g3": 1.0}, + ) + assert [r.test for r in res] == ["all", "up", "down"] + # 'up' uses g1,g3 -> A(g1) and C(g3) have neighbours; B needs g2 (down) so its + # only 'up' neighbour is g1 -> still present. 'down' uses only g2. + down = next(r for r in res if r.test == "down").table + assert set(down["metabolite"]) <= {"B_c", "C_c"} # g2 touches B and C + + +def test_filters_unknown_and_nan_genes(model): + # gX not in model, gNaN has NaN p-value -> both ignored; result still computed. 
+ (res,) = reporter_metabolites( + model, {"g1": 0.01, "g2": 0.01, "g3": 0.01, "gX": 0.001, "gNaN": float("nan")} + ) + assert "gX" not in set(model.genes.list_attr("id")) # sanity + assert len(res.table) == 3 # A, B, C scored from the three real genes + + +def test_out_of_range_pvalue_dropped_not_poisoning(model): + """A p-value outside [0,1] is dropped, not propagated as NaN through all scores.""" + (res,) = reporter_metabolites(model, {"g1": 0.01, "g2": 0.01, "g3": 1.7}) # g3 invalid + import numpy as np + + assert not np.isnan(res.table["z_score"].to_numpy()).any() # no NaN poisoning diff --git a/tests/test_analysis_sampling.py b/tests/test_analysis_sampling.py new file mode 100644 index 0000000..155500b --- /dev/null +++ b/tests/test_analysis_sampling.py @@ -0,0 +1,133 @@ +"""Tests for random-objective flux sampling (analysis/sampling.py).""" +import cobra +import numpy as np +import pytest + +from raven_python.analysis import ( + RandomSamplingResult, + find_good_reactions, + random_sampling, +) + + +@pytest.fixture +def model(): + """S uptake -> A -> {B export, C export}, plus a thermodynamically infeasible loop. + + sup -> A; A->B (v_b) and A->C (v_c); B,C exported. r_f/r_r form a closed cycle + (X<->Y both directions, no in/out) that can spin arbitrarily — a loop whose + reactions must be excluded from the random objectives. 
+ """ + m = cobra.Model("toy") + A, B, C, X, Y = (cobra.Metabolite(x, compartment="c") for x in "ABCXY") + m.add_metabolites([A, B, C, X, Y]) + + def rxn(rid, mets, lb=0, ub=1000): + r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub) + r.add_metabolites(mets) + return r + + rxns = [ + rxn("sup", {A: 1}, ub=10), # substrate supply + rxn("v_b", {A: -1, B: 1}), # A -> B + rxn("v_c", {A: -1, C: 1}), # A -> C + rxn("EX_B", {B: -1}), # export B + rxn("EX_C", {C: -1}), # export C + rxn("r_f", {X: -1, Y: 1}, lb=-1000), # X <-> Y ┐ closed loop + rxn("r_r", {Y: -1, X: 1}, lb=-1000), # Y <-> X ┘ (no source/sink for X,Y) + ] + m.add_reactions(rxns) + m.objective = "EX_B" + return m + + +def test_good_reactions_excludes_loop(model): + good = find_good_reactions(model) + # The closed X<->Y cycle can spin to the 1000 bound -> excluded. + assert "r_f" not in good and "r_r" not in good + # Real flux-carrying reactions are kept. + assert {"sup", "v_b", "EX_B"} <= set(good) + + +def test_returns_result_shape(model): + res = random_sampling(model, n_samples=20, seed=1) + assert isinstance(res, RandomSamplingResult) + assert res.samples.shape == (20, len(model.reactions)) + assert list(res.samples.columns) == [r.id for r in model.reactions] + assert "r_f" not in res.good_reactions + + +def test_samples_are_steady_state(model): + """Every sample must satisfy S·v = 0 (mass balance).""" + res = random_sampling(model, n_samples=15, seed=2) + s_matrix = cobra.util.create_stoichiometric_matrix(model) + ids = [r.id for r in model.reactions] + for _, row in res.samples.iterrows(): + residual = s_matrix @ row[ids].to_numpy() + assert np.allclose(residual, 0, atol=1e-6) + + +def test_samples_respect_bounds(model): + res = random_sampling(model, n_samples=15, seed=3) + for r in model.reactions: + col = res.samples[r.id].to_numpy() + assert (col >= r.lower_bound - 1e-6).all() + assert (col <= r.upper_bound + 1e-6).all() + + +def test_seed_is_reproducible(model): + a = random_sampling(model, 
n_samples=10, seed=42).samples + b = random_sampling(model, n_samples=10, seed=42).samples + assert np.allclose(a.to_numpy(), b.to_numpy()) + + +def test_good_reactions_reused(model): + """Passing good_reactions back in reproduces the FVA-derived set without recomputing.""" + good = find_good_reactions(model) + res = random_sampling(model, n_samples=5, good_reactions=good, seed=0) + assert res.good_reactions == good + + +def test_min_flux_runs(model): + res = random_sampling(model, n_samples=8, min_flux=True, seed=5) + assert res.samples.shape == (8, len(model.reactions)) + + +def test_diverse_samples(model): + """Random objectives should explore different states, not a single FBA optimum.""" + res = random_sampling(model, n_samples=40, seed=7) + # The branch split A->B vs A->C should vary across samples. + assert res.samples["v_b"].std() > 1e-6 + assert res.samples["v_c"].std() > 1e-6 + + +def test_rejects_bad_n_samples(model): + with pytest.raises(ValueError, match="n_samples"): + random_sampling(model, n_samples=0) + + +def test_too_few_good_reactions(model): + with pytest.raises(ValueError, match="usable reactions"): + random_sampling(model, n_samples=5, good_reactions=["sup"], n_objectives=2) + + +def test_good_reactions_keeps_reactions_at_default_bound(): + """A legitimate reaction reaching the model's 1000 bound is not dropped as a loop. + + Regression: the old loop_bound>=1000 test wrongly excluded any reaction that + reaches the default bound. Loopless FVA keeps it (real flux) and still drops a + closed loop. 
+ """ + m = cobra.Model("b") + a, b = (cobra.Metabolite(x, compartment="c") for x in "ab") + m.add_metabolites([a, b]) + sup = cobra.Reaction("sup", lower_bound=0, upper_bound=1000) # uptake to the 1000 cap + sup.add_metabolites({a: 1}) + conv = cobra.Reaction("conv", lower_bound=0, upper_bound=1000) + conv.add_metabolites({a: -1, b: 1}) + ex = cobra.Reaction("EX_b", lower_bound=0, upper_bound=1000) + ex.add_metabolites({b: -1}) + m.add_reactions([sup, conv, ex]) + m.objective = "EX_b" + good = find_good_reactions(m) + assert {"sup", "conv", "EX_b"} <= set(good) # all reach 1000 but are real, not loops diff --git a/tests/test_binaries.py b/tests/test_binaries.py new file mode 100644 index 0000000..d74ce0b --- /dev/null +++ b/tests/test_binaries.py @@ -0,0 +1,80 @@ +"""Tests for raven_python.binaries (binary resolution + bundled-ZIP provisioning).""" +import hashlib +import shutil +import zipfile +from pathlib import Path + +import pytest + +from raven_python import binaries + + +def test_resolve_explicit_path(): + assert binaries.resolve_binary("blastp", binary="/opt/x/blastp") == "/opt/x/blastp" + + +def test_resolve_env_var(monkeypatch): + monkeypatch.setenv("RAVEN_PYTHON_DIAMOND", "/custom/diamond") + assert binaries.resolve_binary("diamond") == "/custom/diamond" + + +@pytest.mark.skipif(not shutil.which("blastp"), reason="blastp not installed") +def test_resolve_via_path(): + assert binaries.resolve_binary("blastp") == shutil.which("blastp") + + +def test_resolve_unresolvable_raises(monkeypatch): + monkeypatch.setattr(shutil, "which", lambda _: None) + with pytest.raises(FileNotFoundError, match="Could not find"): + binaries.resolve_binary("diamond") # empty registry, not on PATH + + +def test_platform_key_format(): + key = binaries.platform_key() + assert "-" in key + os_part, arch = key.split("-", 1) + assert os_part in {"linux", "macos", "windows"} or os_part # tolerant + + +def test_ensure_binary_downloads_verifies_extracts(tmp_path, monkeypatch): + # Build 
a fake bundle ZIP containing an executable, served via file:// URL. + exe = tmp_path / "footool" + exe.write_text("#!/bin/sh\necho hi\n") + archive = tmp_path / "footool.zip" + with zipfile.ZipFile(archive, "w") as zf: + zf.write(exe, "footool") + sha = hashlib.sha256(archive.read_bytes()).hexdigest() + + registry = { + "footool": { + "version": "1.0", + "provides": ["footool"], + "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": sha}}, + } + } + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache")) + + path = binaries.ensure_binary("footool", registry=registry) + assert Path(path).exists() + assert Path(path).name == "footool" + # cached on second call (same path, no re-download needed) + assert binaries.ensure_binary("footool", registry=registry) == path + + +def test_ensure_binary_sha_mismatch(tmp_path, monkeypatch): + archive = tmp_path / "x.zip" + with zipfile.ZipFile(archive, "w") as zf: + zf.writestr("footool", "data") + registry = { + "footool": {"version": "1", "provides": ["footool"], + "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": "deadbeef"}}} + } + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache")) + with pytest.raises(ValueError, match="SHA256 mismatch"): + binaries.ensure_binary("footool", registry=registry) + + +def test_ensure_binary_unhosted_platform_raises(tmp_path): + registry = {"footool": {"version": "1", "provides": ["footool"], "platforms": {}}} + with pytest.raises(FileNotFoundError, match="No bundled"): + binaries.ensure_binary("footool", registry=registry) diff --git a/tests/test_change_grrules.py b/tests/test_change_grrules.py new file mode 100644 index 0000000..d33f723 --- /dev/null +++ b/tests/test_change_grrules.py @@ -0,0 +1,49 @@ +"""Tests for change_gene_reaction_rules (changeGrRules port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, change_gene_reaction_rules + + +@pytest.fixture +def model(): + m = 
cobra.Model("t") + m.add_metabolites( + [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}, + {"id": "R2", "equation": "a_c --> b_c"}, + ], + ) + return m + + +def test_replace_rule_and_create_genes(model): + (rxn,) = change_gene_reaction_rules(model, {"R1": "G2 and G3"}) + assert rxn.gene_reaction_rule == "G2 and G3" + assert {g.id for g in rxn.genes} == {"G2", "G3"} + assert {"G2", "G3"} <= {g.id for g in model.genes} + + +def test_append_rule(model): + change_gene_reaction_rules(model, {"R1": "G4"}, replace=False) + # (G1) or (G4), normalised by cobra + assert model.reactions.get_by_id("R1").gene_reaction_rule == "G1 or G4" + + +def test_append_when_empty_is_just_new(model): + change_gene_reaction_rules(model, {"R2": "G5"}, replace=False) + assert model.reactions.get_by_id("R2").gene_reaction_rule == "G5" + + +def test_batch(model): + changed = change_gene_reaction_rules(model, {"R1": "GA", "R2": "GB"}) + assert [r.id for r in changed] == ["R1", "R2"] + + +def test_unknown_reaction_errors(model): + with pytest.raises(ValueError, match="not found"): + change_gene_reaction_rules(model, {"NOPE": "G1"}) diff --git a/tests/test_comparison.py b/tests/test_comparison.py new file mode 100644 index 0000000..26a20f0 --- /dev/null +++ b/tests/test_comparison.py @@ -0,0 +1,123 @@ +"""Tests for comparison/compare.py — N-model comparison (Phase 5).""" +from __future__ import annotations + +import cobra +import pytest + +from raven_python.comparison import ModelComparison, compare_models +from raven_python.tasks import Task + + +def _mk(model_id: str, reactions: list[tuple[str, dict[str, int], str | None]], + genes: list[str] | None = None) -> cobra.Model: + """Tiny helper: build a model with the given reactions + optional gene rules + subsystems.""" + m = cobra.Model(model_id) + mets: dict[str, cobra.Metabolite] = {} + for _rid, 
stoich, _ in reactions: + for mid in stoich: + if mid not in mets: + mets[mid] = cobra.Metabolite(mid, name=mid.split("_")[0], compartment="c") + m.add_metabolites([mets[mid]]) + for (rid, stoich, sub), gpr in zip(reactions, genes or [None] * len(reactions), strict=True): + r = cobra.Reaction(rid, lower_bound=-1000, upper_bound=1000) + r.add_metabolites({mets[mid]: c for mid, c in stoich.items()}) + if sub is not None: + r.subsystem = sub + if gpr is not None: + r.gene_reaction_rule = gpr + m.add_reactions([r]) + return m + + +@pytest.fixture +def two_models(): + """Two models sharing r1/r2 but each with one unique reaction; different subsystems.""" + a = _mk("A", [("r1", {"A_c": -1, "B_c": 1}, "carbo"), + ("r2", {"B_c": -1, "C_c": 1}, "amino"), + ("r3", {"C_c": -1, "D_c": 1}, "carbo")], + genes=["g1", "g2", "g3"]) + b = _mk("B", [("r1", {"A_c": -1, "B_c": 1}, "carbo"), + ("r2", {"B_c": -1, "C_c": 1}, "amino"), + ("r4", {"B_c": -1, "E_c": 1}, "lipid")], + genes=["g1", "g2", "g4"]) + return [a, b] + + +def test_returns_dataclass(two_models): + res = compare_models(two_models) + assert isinstance(res, ModelComparison) + assert res.model_ids == ["A", "B"] + + +def test_reactions_matrix_shape_and_values(two_models): + res = compare_models(two_models) + # union = {r1, r2, r3, r4}; both have r1+r2, only A has r3, only B has r4. 
+ assert set(res.reactions.index) == {"r1", "r2", "r3", "r4"} + assert res.reactions.loc["r1", "A"] == 1 and res.reactions.loc["r1", "B"] == 1 + assert res.reactions.loc["r3", "A"] == 1 and res.reactions.loc["r3", "B"] == 0 + assert res.reactions.loc["r4", "A"] == 0 and res.reactions.loc["r4", "B"] == 1 + + + def test_metabolites_and_genes_union(two_models): + res = compare_models(two_models) + assert set(res.metabolites.index) == {"A_c", "B_c", "C_c", "D_c", "E_c"} + assert set(res.genes.index) == {"g1", "g2", "g3", "g4"} + assert res.genes.loc["g3", "A"] == 1 and res.genes.loc["g3", "B"] == 0 + + + def test_subsystems_counts(two_models): + res = compare_models(two_models) + # A: carbo=2 (r1+r3), amino=1; B: carbo=1, amino=1, lipid=1. + assert res.subsystems.loc["carbo", "A"] == 2 + assert res.subsystems.loc["carbo", "B"] == 1 + assert res.subsystems.loc["lipid", "B"] == 1 + assert res.subsystems.loc["lipid", "A"] == 0 + + + def test_subsystems_empty_falls_under_none(): + a = _mk("A", [("r1", {"X_c": -1, "Y_c": 1}, None)]) + b = _mk("B", [("r1", {"X_c": -1, "Y_c": 1}, "")]) + res = compare_models([a, b]) + assert res.subsystems.loc["(none)", "A"] == 1 + assert res.subsystems.loc["(none)", "B"] == 1 + + + def test_jaccard_similarity_diagonal_and_symmetry(two_models): + res = compare_models(two_models) + # Diagonal = 1 (self vs self). + assert res.similarity.loc["A", "A"] == 1.0 + assert res.similarity.loc["B", "B"] == 1.0 + # Symmetric. + assert res.similarity.loc["A", "B"] == res.similarity.loc["B", "A"] + # Shared r1+r2; total union 4 → Jaccard 2/4 = 0.5. + assert res.similarity.loc["A", "B"] == pytest.approx(0.5) + + + def test_tasks_optional_and_passed_through(two_models): + """Only B can make E (via r4) → expect B to pass the make-E task and A to fail.""" + # Add a sink so E can be excreted (otherwise it accumulates → infeasible at steady state).
+ for m in two_models: + if "E_c" in [x.id for x in m.metabolites]: + m.add_boundary(m.metabolites.get_by_id("E_c"), type="demand") + res = compare_models(two_models, tasks=[ + Task(id="make_E", inputs=[("A[c]", 0.0, 1000.0)], outputs=[("E[c]", 1.0, 1.0)]), + ]) + assert res.tasks is not None + assert list(res.tasks.index) == ["make_E"] + # Only B has r4 (which makes E), so only B passes. + assert bool(res.tasks.loc["make_E", "B"]) is True + assert bool(res.tasks.loc["make_E", "A"]) is False + + +def test_duplicate_or_missing_model_id_disambiguated(): + """Two models with the same id (or empty id) should get distinct labels.""" + a = _mk("", [("r1", {"X_c": -1, "Y_c": 1}, None)]) + b = _mk("", [("r1", {"X_c": -1, "Y_c": 1}, None)]) + res = compare_models([a, b]) + assert res.model_ids[0] == "model_0" + assert res.model_ids[1] != "model_0" # disambiguated + + +def test_rejects_single_model(two_models): + with pytest.raises(ValueError, match="needs .*2"): + compare_models(two_models[:1]) diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..714c3a9 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,89 @@ +"""Tests for ensure_data (data.py). 
Uses file:// URLs to avoid the network.""" +import hashlib + +import pytest + +from raven_python.data import ( + CORE_KEGG_FILES, + ensure_data_file, + ensure_kegg_data, +) + + +def _sha256(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +@pytest.fixture +def served(tmp_path, monkeypatch): + """A fake registry served from local files, with the cache pointed at tmp.""" + src = tmp_path / "src" + src.mkdir() + payloads = { + "reference_model.yml.gz": b"!!omap model bytes", + "ko_reaction.tsv.gz": b"ko\treaction\n", + "ko_names.tsv.gz": b"ko\tname\n", + "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n", + "rxn_flags.tsv.gz": b"reaction\tspontaneous\n", + } + files = {} + for name, data in payloads.items(): + path = src / name + path.write_bytes(data) + files[name] = {"url": path.as_uri(), "sha256": _sha256(data)} + registry = {"kegg": {"version": "v1", "files": files}} + + cache = tmp_path / "cache" + monkeypatch.setenv("XDG_CACHE_HOME", str(cache)) + return registry, cache, payloads + + +def test_ensure_data_file_downloads_and_caches(served): + registry, cache, payloads = served + path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry) + assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz" + assert path.read_bytes() == payloads["ko_reaction.tsv.gz"] + + +def test_ensure_data_file_reuses_cache(served, monkeypatch): + registry, _, _ = served + first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry) + # Break the URL: a second call must hit the cache, not re-download. 
+ registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent" + second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry) + assert first == second and second.exists() + + +def test_sha256_mismatch_rejected(served): + registry, cache, _ = served + registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64 + with pytest.raises(ValueError, match="SHA256 mismatch"): + ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry) + # The corrupt partial download must not be left behind. + assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists() + + +def test_unknown_dataset_actionable_error(served): + registry, _, _ = served + with pytest.raises(FileNotFoundError, match="No data artefacts registered"): + ensure_data_file("metacyc", "x", registry=registry) + + +def test_unknown_file_lists_available(served): + registry, _, _ = served + with pytest.raises(FileNotFoundError, match="not registered"): + ensure_data_file("kegg", "missing.tsv.gz", registry=registry) + + +def test_ensure_kegg_data_fetches_core_set(served): + registry, cache, _ = served + out = ensure_kegg_data(registry=registry) + assert out == cache / "raven_python" / "data" / "kegg-v1" + for name in CORE_KEGG_FILES: + assert (out / name).is_file() + + +def test_empty_registry_raises(): + # The shipped registry is empty until artefacts are published. 
+ with pytest.raises(FileNotFoundError, match="No data artefacts registered"): + ensure_data_file("kegg", "ko_reaction.tsv.gz") diff --git a/tests/test_gapfilling.py b/tests/test_gapfilling.py new file mode 100644 index 0000000..b92e982 --- /dev/null +++ b/tests/test_gapfilling.py @@ -0,0 +1,109 @@ +"""Tests for connectivity gap-filling (gapfilling/fill.py, Phase 4b).""" +import cobra +import pytest + +from raven_python.gapfilling import GapFillResult, connect_blocked_reactions + + +def _met(mid): + return cobra.Metabolite(mid, name=mid, compartment="c") + + +@pytest.fixture +def draft_and_template(): + """Draft: EX_A -> A -> B (r1), but B has no consumer, so r1 is blocked. + + Template supplies B -> C (r2) and an exchange for C, which unblocks r1. + """ + A, B = _met("A_c"), _met("B_c") + draft = cobra.Model("draft") + exa = cobra.Reaction("EX_A", lower_bound=-10, upper_bound=1000) + exa.add_metabolites({A: 1}) + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) # A -> B, irreversible + r1.add_metabolites({A: -1, B: 1}) + draft.add_reactions([exa, r1]) + + template = cobra.Model("template") + r2 = cobra.Reaction("r2", lower_bound=0, upper_bound=1000) # B -> C + r2.add_metabolites({_met("B_c"): -1, _met("C_c"): 1}) + exc = cobra.Reaction("EX_C", lower_bound=-1000, upper_bound=1000) + exc.add_metabolites({_met("C_c"): -1}) + extra = cobra.Reaction("r_unneeded", lower_bound=0, upper_bound=1000) # D -> E, irrelevant + extra.add_metabolites({_met("D_c"): -1, _met("E_c"): 1}) + template.add_reactions([r2, exc, extra]) + return draft, template + + +# --------------------------------------------------------------------------- # +# Connectivity gap-fill +# --------------------------------------------------------------------------- # +def test_fill_gaps_connects_blocked_reaction(draft_and_template): + draft, template = draft_and_template + assert "r1" in cobra.flux_analysis.find_blocked_reactions(draft) # precondition + + res = connect_blocked_reactions(draft, 
template) + assert isinstance(res, GapFillResult) + assert "r1" in res.newly_connected + assert set(res.added_reactions) == {"r2", "EX_C"} # both needed to drain B + assert "r_unneeded" not in res.added_reactions # irrelevant template rxn not added + + +def test_fill_gaps_returns_working_model_that_unblocks(draft_and_template): + draft, template = draft_and_template + res = connect_blocked_reactions(draft, template) + assert {"r2", "EX_C"} <= {r.id for r in res.model.reactions} + assert "r1" not in cobra.flux_analysis.find_blocked_reactions(res.model) + # original draft is untouched + assert "r2" not in {r.id for r in draft.reactions} + + +def test_fill_gaps_nothing_to_do_when_unblocked(draft_and_template): + draft, template = draft_and_template + # give the draft its own drain so r1 is not blocked + drain = cobra.Reaction("EX_B", lower_bound=-1000, upper_bound=1000) + drain.add_metabolites({draft.metabolites.B_c: -1}) + draft.add_reactions([drain]) + res = connect_blocked_reactions(draft, template) + assert res.added_reactions == [] + assert res.newly_connected == [] + + +def test_fill_gaps_scores_prefer_higher_scored_reactions(): + # Two alternative single-reaction drains for B; scores should pick the preferred one. + A, B = _met("A_c"), _met("B_c") + draft = cobra.Model("draft") + exa = cobra.Reaction("EX_A", lower_bound=-10, upper_bound=1000) + exa.add_metabolites({A: 1}) + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r1.add_metabolites({A: -1, B: 1}) + draft.add_reactions([exa, r1]) + template = cobra.Model("t") + d1 = cobra.Reaction("drain1", lower_bound=-1000, upper_bound=1000) + d1.add_metabolites({_met("B_c"): -1}) + d2 = cobra.Reaction("drain2", lower_bound=-1000, upper_bound=1000) + d2.add_metabolites({_met("B_c"): -1}) + template.add_reactions([d1, d2]) + # Scores are penalties (higher = preferred = cheaper to include); only one drain + # is needed, so the less-penalised drain1 is chosen. 
+ res = connect_blocked_reactions(draft, template, scores={"drain1": -1.0, "drain2": -5.0}) + assert res.added_reactions == ["drain1"] + + +def test_unconnectable_reaction_reported_not_added(): + # A blocked irreversible reaction that no template can connect: reported, no adds. + A, B = _met("A_c"), _met("B_c") + draft = cobra.Model("draft") + exa = cobra.Reaction("EX_A", lower_bound=-10, upper_bound=1000) + exa.add_metabolites({A: 1}) + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) # A -> B, B has no drain + r1.add_metabolites({A: -1, B: 1}) + draft.add_reactions([exa, r1]) + template = cobra.Model("t") # offers nothing that can drain B + noise = cobra.Reaction("noise", lower_bound=0, upper_bound=1000) + noise.add_metabolites({_met("X_c"): -1, _met("Y_c"): 1}) + template.add_reactions([noise]) + + res = connect_blocked_reactions(draft, template) + assert res.added_reactions == [] + assert res.newly_connected == [] + assert "r1" in res.cannot_connect diff --git a/tests/test_init.py b/tests/test_init.py new file mode 100644 index 0000000..a61ee19 --- /dev/null +++ b/tests/test_init.py @@ -0,0 +1,110 @@ +"""Tests for the INIT MILP (init/init.py, Phase 4c).""" +import cobra +import pytest + +from raven_python.init import InitResult, run_init + + +def _met(mid): + return cobra.Metabolite(mid, name=mid[:-2] if mid.endswith("_c") else mid, compartment="c") + + +@pytest.fixture +def model(): + """EX_A -> A -(r1)-> B -(r2)-> C -(r3)-> D, with A uptake and excretion allowed. + + r1, r2 are good (positive score); r3 is bad (negative score). 
+ """ + m = cobra.Model("net") + A, B, C, D = _met("A_c"), _met("B_c"), _met("C_c"), _met("D_c") + m.add_metabolites([A, B, C, D]) + exa = cobra.Reaction("EX_A", lower_bound=-1000, upper_bound=1000) + exa.add_metabolites({A: -1}) # negative flux = uptake of A + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r1.add_metabolites({A: -1, B: 1}) + r2 = cobra.Reaction("r2", lower_bound=0, upper_bound=1000) + r2.add_metabolites({B: -1, C: 1}) + r3 = cobra.Reaction("r3", lower_bound=0, upper_bound=1000) + r3.add_metabolites({C: -1, D: 1}) + m.add_reactions([exa, r1, r2, r3]) + return m + + +def test_keeps_positive_drops_negative(model): + scores = {"r1": 1.0, "r2": 1.0, "r3": -1.0} + res = run_init(model, scores, prod_weight=0.0, allow_excretion=True) + assert isinstance(res, InitResult) + kept = {r.id for r in res.model.reactions} + assert {"r1", "r2"} <= kept # positive-score, flux-consistent -> kept + assert "r3" in res.deleted_reactions # negative score -> removed + assert "r3" not in kept + + +def test_negative_scores_emptied_when_no_reward(model): + # All reactions negative and no production reward -> keep nothing (empty optimum). + scores = {r.id: -1.0 for r in model.reactions} + res = run_init(model, scores, prod_weight=0.0, allow_excretion=True) + assert res.deleted_reactions == sorted(r.id for r in model.reactions) + assert len(res.model.reactions) == 0 + + +def test_essential_reaction_forced_kept(model): + # r3 is negative-scored but essential -> must be kept despite the penalty. + scores = {"r1": 1.0, "r2": 1.0, "r3": -1.0} + res = run_init(model, scores, essential_rxns=["r3"], prod_weight=0.0, allow_excretion=True) + kept = {r.id for r in res.model.reactions} + assert "r3" in kept + assert "r3" not in res.deleted_reactions + + +def test_prod_weight_pulls_in_connectivity(model): + # With everything scored 0, no reward -> empty. With prod_weight>0, producing + # metabolites is rewarded, so flux-carrying reactions are pulled in. 
+ zero = {r.id: 0.0 for r in model.reactions} + empty = run_init(model, zero, prod_weight=0.0, allow_excretion=True) + assert len(empty.model.reactions) == 0 + pulled = run_init(model, zero, prod_weight=0.5, allow_excretion=True) + assert len(pulled.model.reactions) > 0 + + +def test_present_mets_reports_producibility(model): + scores = {"r1": 1.0, "r2": 1.0} + res = run_init( + model, scores, present_mets=["C", "Z"], prod_weight=0.0, allow_excretion=True + ) + assert res.met_production["C"] is True # A->B->C is producible + assert res.met_production["Z"] is False # not in the model + + +def test_objective_returned(model): + res = run_init(model, {"r1": 1.0, "r2": 1.0, "r3": -1.0}, prod_weight=0.0, allow_excretion=True) + assert res.objective == pytest.approx(2.0) # kept r1(+1) + r2(+1), dropped r3 + + +def test_reversible_essential_keeps_productive_path(): + """A reversible essential reaction must not be forced into a phantom fwd+rev loop. + + SRC -> a, R: a <=> b (reversible, essential), SNK: b ->. Forcing R essential + should keep the productive path SRC->R->SNK, not delete SRC/SNK and leave R + self-looping (the bug from forcing eps flux through both split directions). + """ + import cobra + + m = cobra.Model("revess") + a, b = (cobra.Metabolite(x, compartment="c") for x in "ab") + m.add_metabolites([a, b]) + src = cobra.Reaction("SRC", lower_bound=0, upper_bound=1000) + src.add_metabolites({a: 1}) + r = cobra.Reaction("R", lower_bound=-1000, upper_bound=1000) + r.add_metabolites({a: -1, b: 1}) + snk = cobra.Reaction("SNK", lower_bound=0, upper_bound=1000) + snk.add_metabolites({b: -1}) + m.add_reactions([src, r, snk]) + m.objective = "SNK" + + res = run_init(m, {"SRC": -1.0, "SNK": -1.0}, essential_rxns=["R"], prod_weight=0.0) + kept = {rxn.id for rxn in res.model.reactions} + assert "R" in kept + # The productive path must be kept (SRC feeds R, SNK drains it); R can't self-loop. 
+ assert {"SRC", "SNK"} <= kept + assert res.model.slim_optimize() > 1e-6 # the kept model actually carries flux diff --git a/tests/test_init_build.py b/tests/test_init_build.py new file mode 100644 index 0000000..cbc566f --- /dev/null +++ b/tests/test_init_build.py @@ -0,0 +1,132 @@ +"""Tests for tINIT scoring + get_init_model (init/score.py, init/build.py).""" +import math + +import cobra +import pytest + +from raven_python.init import ( + InitModelResult, + gene_scores_from_expression, + get_init_model, + score_reactions_from_genes, +) + + +# --------------------------------------------------------------------------- # +# score_reactions_from_genes +# --------------------------------------------------------------------------- # +@pytest.fixture +def gpr_model(): + m = cobra.Model("g") + a = cobra.Metabolite("a_c", compartment="c") + b = cobra.Metabolite("b_c", compartment="c") + m.add_metabolites([a, b]) + r_complex = cobra.Reaction("r_complex") # (g1 and g2) or g3 + r_complex.add_metabolites({a: -1, b: 1}) + m.add_reactions([r_complex]) + r_complex.gene_reaction_rule = "(g1 and g2) or g3" + r_nogene = cobra.Reaction("r_nogene") + r_nogene.add_metabolites({b: -1}) + m.add_reactions([r_nogene]) + return m + + +def test_score_isozyme_max_complex_min(gpr_model): + # (g1 and g2) or g3 -> max(min(1, 4), 3) = max(1, 3) = 3 + scores = score_reactions_from_genes(gpr_model, {"g1": 1.0, "g2": 4.0, "g3": 3.0}) + assert scores["r_complex"] == 3.0 + + +def test_score_no_gene_reaction_gets_default(gpr_model): + scores = score_reactions_from_genes(gpr_model, {"g1": 1, "g2": 1, "g3": 1}, no_gene_score=-2.0) + assert scores["r_nogene"] == -2.0 + + +def test_score_missing_genes_omitted(gpr_model): + # g2 missing -> complex (g1 and g2) collapses to g1=1; OR with g3=3 -> max(1,3)=3 + scores = score_reactions_from_genes(gpr_model, {"g1": 1.0, "g3": 3.0}) + assert scores["r_complex"] == 3.0 + # all genes missing -> no_gene_score + assert score_reactions_from_genes(gpr_model, 
{})["r_complex"] == -2.0 + + +def test_score_invalid_method(gpr_model): + with pytest.raises(ValueError, match="isozyme_scoring"): + score_reactions_from_genes(gpr_model, {}, isozyme_scoring="nonsense") + + +# --------------------------------------------------------------------------- # +# gene_scores_from_expression (RNA-seq path) +# --------------------------------------------------------------------------- # +def test_expression_scores_sign_and_clamp(): + expr = {"hi": 100.0, "lo": 1.0, "mid": 10.0, "zero": 0.0} + ref = 10.0 # threshold/reference + s = gene_scores_from_expression(expr, ref) + assert s["hi"] == pytest.approx(min(5 * math.log(10), 10.0)) # above ref -> positive + assert s["lo"] == pytest.approx(max(5 * math.log(0.1), -5.0)) # below ref -> negative + assert s["mid"] == pytest.approx(0.0) # at ref -> 0 + assert s["zero"] == -5.0 # non-positive -> floor + + +def test_expression_per_gene_reference(): + expr = {"g": 20.0} + s = gene_scores_from_expression(expr, {"g": 5.0}) + assert s["g"] == pytest.approx(5 * math.log(4)) + + +# --------------------------------------------------------------------------- # +# get_init_model pipeline +# --------------------------------------------------------------------------- # +@pytest.fixture +def model(): + m = cobra.Model("net") + A, B, C, D = (cobra.Metabolite(x, name=x[:-2], compartment="c") for x in ("A_c", "B_c", "C_c", "D_c")) + m.add_metabolites([A, B, C, D]) + exa = cobra.Reaction("EX_A", lower_bound=-1000, upper_bound=1000) + exa.add_metabolites({A: -1}) + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r1.add_metabolites({A: -1, B: 1}) + r2 = cobra.Reaction("r2", lower_bound=0, upper_bound=1000) + r2.add_metabolites({B: -1, C: 1}) + r3 = cobra.Reaction("r3", lower_bound=0, upper_bound=1000) + r3.add_metabolites({C: -1, D: 1}) + m.add_reactions([exa, r1, r2, r3]) + for r, rule in (("r1", "g1"), ("r2", "g2"), ("r3", "g3")): + m.reactions.get_by_id(r).gene_reaction_rule = rule + return m + + +def 
test_get_init_model_from_gene_scores(model): + # g1,g2 expressed (positive), g3 not (negative) -> keep r1,r2, drop r3. + res = get_init_model(model, gene_scores={"g1": 5.0, "g2": 5.0, "g3": -5.0}, prod_weight=0.0) + assert isinstance(res, InitModelResult) + kept = {r.id for r in res.model.reactions} + assert {"r1", "r2"} <= kept + assert "r3" not in kept + assert res.reaction_scores["r1"] == 5.0 + + +def test_get_init_model_requires_one_score_source(model): + with pytest.raises(ValueError, match="exactly one"): + get_init_model(model) + with pytest.raises(ValueError, match="exactly one"): + get_init_model(model, rxn_scores={}, gene_scores={}) + + +def test_get_init_model_essential_kept(model): + # r3 negative-scored but essential -> kept. + res = get_init_model( + model, rxn_scores={"r1": 1, "r2": 1, "r3": -1}, essential_rxns=["r3"], prod_weight=0.0 + ) + assert "r3" in {r.id for r in res.model.reactions} + + +def test_get_init_model_removes_dead_ends(model): + # An isolated reaction that can never carry flux is dropped as a dead end. + X, Y = cobra.Metabolite("X_c", compartment="c"), cobra.Metabolite("Y_c", compartment="c") + dead = cobra.Reaction("dead", lower_bound=0, upper_bound=1000) + dead.add_metabolites({X: -1, Y: 1}) # X has no source, Y no sink (no exchange) + model.add_reactions([dead]) + res = get_init_model(model, rxn_scores={"r1": 1, "r2": 1}, prod_weight=0.0) + assert "dead" in res.deleted_dead_end_reactions + assert "dead" not in {r.id for r in res.model.reactions} diff --git a/tests/test_init_ftinit.py b/tests/test_init_ftinit.py new file mode 100644 index 0000000..58f0542 --- /dev/null +++ b/tests/test_init_ftinit.py @@ -0,0 +1,139 @@ +"""Phase 4d.3: the single-step ftINIT MILP (run_ftinit). + +Validated on the testModel oracle against (a) a hand-checked score-optimal solution, +(b) the formulation invariants, and (c) exact agreement with the already-tested +run_init. 
The full-pipeline RAVEN outputs (tinitTests T0001/T0002) additionally +involve linear merge + the toIgnore masks + staging + exchange re-adding, layered on +in 4d.2/4d.3b/4d.5. + +Note on the toy result: with strict mass balance and no metabolite-production reward +(ftINIT, unlike classic INIT, only rewards metabolomics-detected mets), the +score-optimal subnetwork on testModel is the internal cycle R4→R6→(R10 rev)→(R9 rev), +worth 7+0.5-3+3.5 = 8.0 — it beats the "honest" exchange path because that path must +pay for the negative-score transport reactions R2/R7. The bare INIT MILP has no +loopless constraint (neither does RAVEN's); loop-free models come from the staged +pipeline + exchange handling and, at genome scale, from models having real exchanges +so such cycles are not score-optimal. This faithfully matches RAVEN's MILP. +""" +import cobra +import pytest +from tinit_oracles import TEST_MODEL_SCORES, expr_for_rxn_score, make_test_model + +from raven_python.init import FtInitResult, run_ftinit, run_init +from raven_python.init.score import gene_scores_from_expression, score_reactions_from_genes + +_LOOP = {"R4", "R6", "R9", "R10"} # the score-optimal subnetwork (8.0) + + +def _scores(model): + expr = expr_for_rxn_score(TEST_MODEL_SCORES) + return score_reactions_from_genes(model, gene_scores_from_expression(expr, 1.0)) + + +def test_full_milp_score_optimum(): + model = make_test_model() + res = run_ftinit(model, _scores(model)) + assert isinstance(res, FtInitResult) + assert set(res.kept_reactions) == _LOOP + assert res.deleted_reactions == ["R1", "R2", "R3", "R5", "R7", "R8"] + assert res.objective == pytest.approx(8.0, abs=1e-6) + + +def test_kept_reactions_carry_flux_and_balance(): + """Indicator-on reactions carry flux (≥ force_on) and the solution is steady-state.""" + model = make_test_model() + res = run_ftinit(model, _scores(model)) + for rid in res.kept_reactions: + assert abs(res.fluxes[rid]) > 1e-9 + # The extracted model is itself 
feasible/flux-consistent. + assert res.model.slim_optimize() is not None + + +def test_agrees_with_run_init(): + """Exact agreement with the classic INIT MILP (no production reward, no rev loops). + + run_init splits reversibles and double-scores both directions unless no_rev_loops, + so we compare under matching settings: same objective and same kept set. + """ + model = make_test_model() + scores = _scores(model) + ft = run_ftinit(model, scores) + init = run_init(model, scores, prod_weight=0.0, eps=0.1, no_rev_loops=True) + assert set(ft.kept_reactions) == {r.id for r in init.model.reactions} + assert ft.objective == pytest.approx(init.objective, abs=1e-6) + + +def test_essential_force_clamps_to_capacity(): + """Forcing an essential reaction is clamped to its capacity (no lb>ub crash). + + A reaction capped at 0.05 forced with the default 0.1 must not error; it is forced + to its capacity (0.05) and the model stays feasible. A per-reaction force of 0.04 + forces exactly that. + """ + m = cobra.Model("cap") + a, b = (cobra.Metabolite(x, compartment="s") for x in "ab") + m.add_metabolites([a, b]) + r = cobra.Reaction("LOW", lower_bound=0, upper_bound=0.05) # tiny capacity + r.add_metabolites({a: -1, b: 1}) + for mid, st in [("EX_a", {a: -1}), ("EX_b", {b: -1})]: + ex = cobra.Reaction(mid, lower_bound=-1000, upper_bound=1000) + ex.add_metabolites(st) + m.add_reactions([ex]) + m.add_reactions([r]) + m.objective = "LOW" + + res = run_ftinit(m, {}, essential_rxns=["LOW"], force_on_ess=0.1) # clamped to 0.05 + assert res.fluxes["LOW"] >= 0.05 - 1e-9 + res2 = run_ftinit(m, {}, essential_rxns=["LOW"], essential_force={"LOW": 0.04}) + assert res2.fluxes["LOW"] >= 0.04 - 1e-9 + + +def test_essential_reaction_forced_on(): + """An essential reaction is kept and carries flux even when its score is negative.""" + model = make_test_model() + res = run_ftinit(model, _scores(model), essential_rxns=["R3"]) + assert "R3" in res.kept_reactions + assert abs(res.fluxes["R3"]) > 1e-6 + 
+ +def test_rem_pos_rev_drops_positive_reversibles(): + """rem_pos_rev frees positive reversibles (score→0): the score-8.0 loop collapses. + + R4 (+7) and R10 (+3.5) are positive reversibles; with them unscored, the cycle is + no longer profitable (R6 0.5 - R9 3 < 0), so nothing scored stays on. + """ + model = make_test_model() + res = run_ftinit(model, _scores(model), rem_pos_rev=True) + assert res.objective == pytest.approx(0.0, abs=1e-6) + assert "R6" not in res.kept_reactions and "R9" not in res.kept_reactions + + +def test_allow_excretion_relaxes_balance(): + """With allow_excretion the result stays feasible (net production permitted).""" + model = make_test_model() + res = run_ftinit(model, _scores(model), allow_excretion=True) + assert res.objective >= 8.0 - 1e-6 # at least as good as strict balance + + +def test_unscored_reactions_are_kept_free(): + """Score-0 reactions are left in the model (not removable), not deleted.""" + model = make_test_model() + scores = _scores(model) + scores["R3"] = 0.0 # make R3 unscored -> must not be deleted + res = run_ftinit(model, scores) + assert "R3" not in res.deleted_reactions + + +def test_forced_flux_lower_bound_is_respected(): + """A scored, non-reversible reaction with lb>0 must keep carrying >= lb flux. + + Guards the bound handling: the single-direction branch must use the model's own + [lb, ub], not zero out a positive lower bound. + """ + model = make_test_model() + scores = _scores(model) + # R6 (2 d[c] => e[c]) is forward-irreversible; force >=2 flux through it. + model.reactions.get_by_id("R6").lower_bound = 2.0 + res = run_ftinit(model, scores) + assert res.fluxes["R6"] >= 2.0 - 1e-6 + assert "R6" not in res.deleted_reactions diff --git a/tests/test_init_genes.py b/tests/test_init_genes.py new file mode 100644 index 0000000..862ca17 --- /dev/null +++ b/tests/test_init_genes.py @@ -0,0 +1,71 @@ +"""Phase 4d.5: remove_low_score_genes — the three RAVEN docstring oracle cases. 
+ +Scores use distinct values to avoid the random tie-break RAVEN mentions when all +isozyme alternatives are negative. +""" +import cobra + +from raven_python.init import remove_low_score_genes + + +def _model(rule: str) -> cobra.Model: + m = cobra.Model("g") + a = cobra.Metabolite("a", compartment="c") + b = cobra.Metabolite("b", compartment="c") + r = cobra.Reaction("R", lower_bound=0, upper_bound=1000) + r.add_metabolites({a: -1, b: 1}) + m.add_reactions([r]) + r.gene_reaction_rule = rule + return m + + +def _norm(rule: str) -> str: + """cobra's normalized form of a GPR string, for order/paren-insensitive comparison.""" + return _model(rule).reactions.R.gene_reaction_rule + + +def _result(rule: str, scores: dict) -> str: + out, _ = remove_low_score_genes(_model(rule), scores) + return out.reactions.R.gene_reaction_rule + + +def test_case1_isozyme_vs_complex(): + """G1 or (G2 and G3 and G4); G1,G2 negative → keep the complex.""" + # G1 more negative than G2 so the complex (= G2's score under min) is least-negative. 
+ scores = {"G1": -2.0, "G2": -1.0, "G3": 1.0, "G4": 1.0} + assert _result("G1 or (G2 and G3 and G4)", scores) == _norm("G2 and G3 and G4") + + +def test_case2_two_complexes(): + """G1 or (G2 and G3) or (G4 and G5); G1,G2 negative → keep the positive complex.""" + scores = {"G1": -1.0, "G2": -1.0, "G3": 1.0, "G4": 1.0, "G5": 1.0} + assert _result("G1 or (G2 and G3) or (G4 and G5)", scores) == _norm("G4 and G5") + + +def test_case3_nested_isozyme_in_complex(): + """(G1 and (G2 or G3) and G4); G2 negative → prune G2 from the inner isozyme group.""" + scores = {"G1": 1.0, "G2": -1.0, "G3": 1.0, "G4": 1.0} + assert _result("G1 and (G2 or G3) and G4", scores) == _norm("G1 and G3 and G4") + + +def test_complex_subunit_not_removed_individually(): + """A negative subunit of a pure complex stays (the whole complex is kept).""" + scores = {"G1": 1.0, "G2": -1.0} + assert _result("G1 and G2", scores) == _norm("G1 and G2") + + +def test_single_negative_gene_kept(): + """A reaction's only gene is never removed (≥1 must remain).""" + assert _result("G1", {"G1": -5.0}) == "G1" + + +def test_unscored_genes_not_removed(): + """Genes absent from the score map are treated as unscored and not removed.""" + scores = {"G1": -1.0} # G2 unscored + assert _result("G1 or G2", scores) == _norm("G2") # only the negative G1 dropped + + +def test_removed_genes_reported_and_pruned(): + out, removed = remove_low_score_genes(_model("G1 or G2"), {"G1": -1.0, "G2": 1.0}) + assert removed == ["G1"] + assert "G1" not in {g.id for g in out.genes} diff --git a/tests/test_init_merge.py b/tests/test_init_merge.py new file mode 100644 index 0000000..f6fea8a --- /dev/null +++ b/tests/test_init_merge.py @@ -0,0 +1,109 @@ +"""Phase 4d.2: linear reaction merging (merge_linear + group_rxn_scores). + +Oracles: RAVEN tinitTests T0004. testModel merges {R1,R2},{R3,R5},{R4,R6},{R7,R8}, +{R9,R10}; testModel4 merges {R5,R6},{R7,R8},{R9,R10} with two reactions flipped. 
+""" +import pytest +from tinit_oracles import ( + TEST_MODEL4_GROUP_IDS, + TEST_MODEL4_MERGED_REV, + TEST_MODEL4_REVERSED_RXNS, + TEST_MODEL_GROUP_IDS, + TEST_MODEL_GROUPED_SCORES, + TEST_MODEL_MERGED_LB, + TEST_MODEL_MERGED_REV, + TEST_MODEL_SCORES, + make_test_model, + make_test_model4, +) + +from raven_python.init import group_rxn_scores, merge_linear + + +def test_test_model_group_ids(): + _, orig_ids, group_ids, _ = merge_linear(make_test_model()) + assert orig_ids == [f"R{i}" for i in range(1, 11)] + assert group_ids == TEST_MODEL_GROUP_IDS # [1,1,2,3,2,3,4,4,5,5] + + +def test_test_model_reduced_shape(): + reduced, _, _, _ = merge_linear(make_test_model()) + # Five merged reactions, survivors keep the producer's id, original order. + assert [r.id for r in reduced.reactions] == ["R1", "R3", "R4", "R7", "R9"] + assert [int(r.lower_bound < 0) for r in reduced.reactions] == TEST_MODEL_MERGED_REV + assert [r.lower_bound for r in reduced.reactions] == TEST_MODEL_MERGED_LB + + +def test_test_model_grouped_scores(): + reduced, orig_ids, group_ids, _ = merge_linear(make_test_model()) + scores = dict(zip(orig_ids, TEST_MODEL_SCORES, strict=True)) + grouped = group_rxn_scores(reduced, scores, orig_ids, group_ids, + to_zero={"R1", "R2", "R8"}) + got = [grouped[r.id] for r in reduced.reactions] + assert got == pytest.approx(TEST_MODEL_GROUPED_SCORES) # [0,-0.5,7.5,-1,0.5] + + +def test_test_model4_group_ids_and_flips(): + reduced, orig_ids, group_ids, reversed_rxns = merge_linear(make_test_model4()) + assert group_ids == TEST_MODEL4_GROUP_IDS # [0,0,0,0,1,1,2,2,3,3,0] + assert [int(r.lower_bound < 0) for r in reduced.reactions] == TEST_MODEL4_MERGED_REV + flipped = {oid for oid, rev in zip(orig_ids, reversed_rxns, strict=True) if rev} + assert flipped == set(TEST_MODEL4_REVERSED_RXNS) # {R6, R9} + + +def test_merge_preserves_feasible_space(): + """The reduced model admits flux through the merged export path, like the original. 
+ + The reduced model carries no objective (merging drops genes and objective; ftINIT + sets its own from scores), so we set one on the surviving export reaction. R8 + (e[s]=>) was merged into R7 (grp4), so R7 is the reduced export. + """ + original = make_test_model() + assert original.slim_optimize() > 1e-9 # exports e via R8 + reduced, _, _, _ = merge_linear(original) + reduced.objective = "R7" + assert reduced.slim_optimize() > 1e-9 + + +def test_no_merge_blocks_merging(): + """A reaction in no_merge keeps its own group (id 0) and is not contracted.""" + _, orig_ids, group_ids, _ = merge_linear(make_test_model(), no_merge=["R2"]) + g = dict(zip(orig_ids, group_ids, strict=True)) + assert g["R2"] == 0 # R2 never merged + # R1 was only mergeable with R2, so it stays unmerged too. + assert g["R1"] == 0 + + +def test_multipass_chain_collapses_to_one_group(): + """A 3-reaction chain A→X→Y→Z collapses to one reaction (exercises multi-pass). + + X is degree-2 (r1,r2), Y degree-2 (r2,r3); A and Z are degree-1 (retained). Merging + X makes Y newly degree-2 with the survivor, caught on a later pass. Confluence: all + three reactions end in one group, leaving the net A→Z reaction. 
+ """ + import cobra + + m = cobra.Model("chain") + A, X, Y, Z = (cobra.Metabolite(i, name=i, compartment="c") for i in "AXYZ") + m.add_metabolites([A, X, Y, Z]) + for rid, stoich in [("r1", {A: -1, X: 1}), ("r2", {X: -1, Y: 1}), ("r3", {Y: -1, Z: 1})]: + r = cobra.Reaction(rid, lower_bound=0, upper_bound=1000) + r.add_metabolites(stoich) + m.add_reactions([r]) + + reduced, orig_ids, group_ids, _ = merge_linear(m) + assert len(reduced.reactions) == 1 # collapsed to net A -> Z + assert len(set(group_ids)) == 1 and group_ids[0] != 0 # all three in one group + only = reduced.reactions[0] + assert {mt.id: c for mt, c in only.metabolites.items()} == {"A": -1.0, "Z": 1.0} + + +def test_group_scores_zero_handling(): + """Genuine-zero score → 0.01; a group cancelling to zero with nonzero members → 0.01.""" + reduced, orig_ids, group_ids, _ = merge_linear(make_test_model()) + # Give group {R3,R5} scores that cancel: R3=+1, R5=-1 -> sum 0 but members nonzero. + scores = dict.fromkeys(orig_ids, 0.0) + scores["R3"], scores["R5"] = 1.0, -1.0 + grouped = group_rxn_scores(reduced, scores, orig_ids, group_ids) + assert grouped["R3"] == pytest.approx(0.01) # cancelled group rescued + assert grouped["R4"] == pytest.approx(0.02) # {R4,R6} both genuine-0 → 0.01+0.01 diff --git a/tests/test_init_oracles.py b/tests/test_init_oracles.py new file mode 100644 index 0000000..3f3e52d --- /dev/null +++ b/tests/test_init_oracles.py @@ -0,0 +1,64 @@ +"""Validate the ftINIT toy oracles and that our scoring reproduces RAVEN's. + +This is Phase 4d.0: the correctness scaffold. The (ft)INIT MILP itself is not yet +ported, so the on/off-output oracles in tinit_oracles live there as constants for the +later sub-phases; here we lock down the pieces that already exist — the score→ +expression inversion and scoreComplexModel-equivalent scoring (RAVEN tinitTests +T0009). 
+""" +import pytest +from tinit_oracles import ( + TEST_MODEL4_SCORES, + TEST_MODEL5_SCORES, + TEST_MODEL_SCORES, + expr_for_rxn_score, + make_test_model, + make_test_model4, + make_test_model5, +) + +from raven_python.init.score import gene_scores_from_expression, score_reactions_from_genes + + +@pytest.mark.parametrize( + "make_model, scores", + [ + (make_test_model, TEST_MODEL_SCORES), + (make_test_model4, TEST_MODEL4_SCORES), + (make_test_model5, TEST_MODEL5_SCORES), + ], +) +def test_scoring_reproduces_defined_scores(make_model, scores): + """RAVEN T0009: expr_for_rxn_score → scoreComplexModel round-trips the scores.""" + model = make_model() + expression = expr_for_rxn_score(scores) + gene_scores = gene_scores_from_expression(expression, 1.0) + rxn_scores = score_reactions_from_genes(model, gene_scores) + got = [rxn_scores[r.id] for r in model.reactions] + assert got == pytest.approx(scores, abs=1e-10) + + +def test_expr_for_rxn_score_inverts_scoring(): + """level = exp(score/5); 5·ln(level/1) recovers the score.""" + scores = [-5, -1, 0.5, 7, 10] + expr = expr_for_rxn_score(scores) + recovered = gene_scores_from_expression(expr, 1.0) + assert [recovered[f"G{i + 1}"] for i in range(len(scores))] == pytest.approx(scores) + + +def test_test_model_structure(): + """Sanity: shapes, no-GPR reactions, reversibility, objective.""" + m = make_test_model() + assert len(m.reactions) == 10 and len(m.metabolites) == 8 + no_gpr = {r.id for r in m.reactions if not r.genes} + assert no_gpr == {"R1", "R2", "R8"} # the reactions scored -2 (no gene) + rev = {r.id for r in m.reactions if r.lower_bound < 0} + assert rev == {"R2", "R3", "R4", "R9", "R10"} + assert m.objective.expression.as_coefficients_dict() # objective set (R8) + + +def test_test_model_is_feasible_for_the_task(): + """The toy model can actually make e[s] from a[s] (so the task oracle is meaningful).""" + m = make_test_model() + m.objective = "R8" + assert m.slim_optimize() > 1e-6 diff --git 
a/tests/test_init_pipeline.py b/tests/test_init_pipeline.py new file mode 100644 index 0000000..bf2a2ac --- /dev/null +++ b/tests/test_init_pipeline.py @@ -0,0 +1,161 @@ +"""Phase 4d.3b: the staged ftINIT pipeline (prep_init_model + get_init_steps + ftinit). + +Oracles: RAVEN tinitTests T0001/T0002 on testModel with the default '1+1' schedule. +""" + +from tinit_oracles import ( + TEST_MODEL_FTINIT_NO_TASKS, + TEST_MODEL_FTINIT_SPONT_R7_R10, + TEST_MODEL_FTINIT_WITH_TASK, + TEST_MODEL_SCORES, + TEST_MODEL_TASK_ESSENTIAL_MERGED, + expr_for_rxn_score, + make_test_model, + make_test_task, +) + +from raven_python.init import ( + classify_reactions, + ftinit, + get_init_steps, + prep_init_model, + score_reactions_from_genes, +) +from raven_python.init.score import gene_scores_from_expression + + +def _scores(model): + return score_reactions_from_genes( + model, gene_scores_from_expression(expr_for_rxn_score(TEST_MODEL_SCORES), 1.0) + ) + + +# --------------------------------------------------------------------------- # +# classify_reactions (the toIgnore masks) — tinitTests T0001 mask oracle. 
# --------------------------------------------------------------------------- #
def test_classify_exchange_and_transport():
    """Check the exchange, import and no-GPR masks on testModel with ext_comp "s"."""
    categories = classify_reactions(make_test_model(), ext_comp="s")
    # boundary reactions
    assert categories.exchange == {"R1", "R8"}
    # a[s] <=> a[c], no GPR, into ext comp
    assert categories.import_rxns == {"R2"}
    assert categories.no_gpr == {"R1", "R2", "R8"}
    # R7 has a GPR -> not a transport category
    assert "R7" not in categories.import_rxns


def test_classify_spontaneous():
    """With R7 and R10 passed as spontaneous, exchange ∪ spontaneous is as expected."""
    categories = classify_reactions(
        make_test_model(),
        ext_comp="s",
        spontaneous=["R7", "R10"],
    )
    combined = categories.exchange | categories.spontaneous
    assert combined == {"R1", "R7", "R8", "R10"}


def test_get_init_steps_default():
    """The "1+1" series has two steps with the expected settings; "full" has one."""
    schedule = get_init_steps("1+1")
    assert len(schedule) == 2
    # (how_to_use_prev, ignore_mask) expected for each step, in order.
    expected = [
        ("ignore", (1, 1, 1, 1, 1, 1, 1, 0)),
        ("essential", (1, 0, 0, 0, 1, 0, 0, 0)),
    ]
    for idx, (prev_usage, mask) in enumerate(expected):
        assert schedule[idx].how_to_use_prev == prev_usage
        assert schedule[idx].ignore_mask == mask
    assert len(get_init_steps("full")) == 1


# --------------------------------------------------------------------------- #
# Full '1+1' pipeline — T0001 (no tasks) and T0002 (with task).
+# --------------------------------------------------------------------------- # +def test_ftinit_no_tasks_matches_oracle(): + """T0001: testModel, no tasks, '1+1' → {R1,R4,R6,R8,R9,R10}.""" + model = make_test_model() + prep = prep_init_model(model, ext_comp="s") + out = ftinit(prep, _scores(model)) + assert {r.id for r in out.reactions} == set(TEST_MODEL_FTINIT_NO_TASKS) + + +def test_ftinit_with_spontaneous_matches_oracle(): + """T0001 variant: R7,R10 spontaneous → the path through R2/R7, {R1,R2,R4,R6,R7,R8}.""" + model = make_test_model() + prep = prep_init_model(model, ext_comp="s", spontaneous=["R7", "R10"]) + out = ftinit(prep, _scores(model)) + assert {r.id for r in out.reactions} == set(TEST_MODEL_FTINIT_SPONT_R7_R10) + + +def test_ftinit_with_task_matches_oracle(): + """T0002: task 'make e[s] from a[s]' makes R2,R7 essential → {R1,R2,R4,R6,R7,R8,R9,R10}.""" + model = make_test_model() + prep = prep_init_model(model, [make_test_task()], ext_comp="s") + # Essentials map to merged ids {R1, R7} (RAVEN T0002). + assert prep.essential_rxns == set(TEST_MODEL_TASK_ESSENTIAL_MERGED) + out = ftinit(prep, _scores(model)) + assert {r.id for r in out.reactions} == set(TEST_MODEL_FTINIT_WITH_TASK) + + +def test_full_series_runs(): + """The single-step 'full' series also produces a feasible subnetwork.""" + model = make_test_model() + prep = prep_init_model(model, ext_comp="s") + out = ftinit(prep, _scores(model), series="full") + assert len(out.reactions) >= 1 + + +def test_pipeline_with_gene_scores_and_tasks_wires_up(): + """ftinit accepts gene_scores (gene pruning) + tasks (gap-fill) without breaking T0002. + + The toy's GPRs are single-gene (nothing to prune) and the task is feasible in the + extracted model (nothing to gap-fill), so the reaction set is unchanged — this + confirms the integration wiring (the pruning/gap-fill logic is unit-tested + separately in test_init_genes / test_init_taskfill). 
+ """ + model = make_test_model() + gene_scores = gene_scores_from_expression(expr_for_rxn_score(TEST_MODEL_SCORES), 1.0) + prep = prep_init_model(model, [make_test_task()], ext_comp="s") + out = ftinit(prep, _scores(model), gene_scores=gene_scores) + assert {r.id for r in out.reactions} == set(TEST_MODEL_FTINIT_WITH_TASK) + + +def test_orient_forward_reverses_a_reversible_reaction(): + """_orient_forward(rxn, -1) flips stoichiometry and makes it irreversible forward.""" + import cobra + + from raven_python.init.prep import _orient_forward + + m = cobra.Model("o") + a, b = (cobra.Metabolite(x, compartment="s") for x in "ab") + m.add_metabolites([a, b]) + r = cobra.Reaction("R", lower_bound=-800, upper_bound=1000) + r.add_metabolites({a: -1, b: 2}) # a <=> 2 b + m.add_reactions([r]) + + _orient_forward(r, -1) # forced reverse → becomes forward + assert r.bounds == (0, 800) # [-800,1000] → flip [-1000,800] → lb→0 + assert {mt.id: c for mt, c in r.metabolites.items()} == {"a": 1, "b": -2} # 2 b => a + + fwd = cobra.Reaction("F", lower_bound=-500, upper_bound=900) + fwd.add_metabolites({a: -1}) + m.add_reactions([fwd]) + _orient_forward(fwd, 1) # forced forward → just made irreversible + assert fwd.bounds == (0, 900) + + +def test_essential_merged_away_is_skipped(): + """An essential reaction whose merge group collapses away imposes no constraint. + + REV sits between two exchanges, so it merges with them into a trivial source→sink + that is removed; its group has no survivor. prep_init_model must skip it, not crash. 
+ """ + import cobra + + from raven_python.tasks import Task + + m = cobra.Model("collapse") + a, b = (cobra.Metabolite(x, name=x, compartment="s") for x in "ab") + m.add_metabolites([a, b]) + r = cobra.Reaction("REV", lower_bound=-1000, upper_bound=1000) + r.add_metabolites({a: -1, b: 1}) + r.gene_reaction_rule = "g1" + exchanges = [] + for met in (a, b): + ex = cobra.Reaction(f"EX_{met.id}", lower_bound=-1000, upper_bound=1000) + ex.add_metabolites({met: -1}) + exchanges.append(ex) + m.add_reactions([r, *exchanges]) + m.objective = "REV" + task = Task(id="mk_a", inputs=[("b[s]", 0.0, 1000.0)], outputs=[("a[s]", 1.0, 1.0)]) + + prep = prep_init_model(m, [task], ext_comp="s") # must not raise + assert "REV" not in prep.essential_rxns # merged into a collapsed group diff --git a/tests/test_init_solvers.py b/tests/test_init_solvers.py new file mode 100644 index 0000000..514c408 --- /dev/null +++ b/tests/test_init_solvers.py @@ -0,0 +1,149 @@ +"""Cross-solver smoke tests for the (f)tINIT MILP path. + +The clean-data calibration and robustness studies were run on Gurobi; the tractability +choices (big-M=100, MIP gap, time limits) and the Gurobi-specific param plumbing +(``opt.problem.Params.MIPGap``) only matter if those choices also work on the *other* +MILP backends real users have. These tests assert that each available MILP-capable +optlang interface produces the same reaction-set verdict as Gurobi on the toy models the +unit tests use — so a regression in solver portability fails CI instead of being found +months later on a user's machine. + +Solvers tested: every MILP-capable cobra/optlang interface that imports in this env +(Gurobi, HiGHS via ``hybrid``, GLPK). Missing ones are skipped automatically. Genome-scale +behaviour is measured separately by ``scripts/analyze_init_solvers.py`` (manual benchmark). 
+""" +from __future__ import annotations + +import importlib + +import cobra +import pytest + +from raven_python.init import ftinit, prep_init_model, run_ftinit, run_init +from raven_python.tasks import Task, check_tasks + +# Detect which MILP-capable optlang interfaces actually work; skip the rest. +# We do a real import (not just find_spec) because optlang ships every backend's +# module file but those that wrap third-party solvers (gurobi, cplex) only import +# cleanly when the underlying solver is installed — find_spec would say "present" +# and then we'd crash at fixture time on CI runners without Gurobi. +_INTERFACES = {"gurobi": "gurobi_interface", "hybrid": "hybrid_interface", "glpk": "glpk_interface"} + + +def _solver_available(modname: str) -> bool: + try: + importlib.import_module(f"optlang.{modname}") + return True + except ImportError: + return False + + +_AVAILABLE = [name for name, mod in _INTERFACES.items() if _solver_available(mod)] + +# Known upstream blocker: ``optlang.hybrid_interface.Configuration.clone()`` rejects +# ``lp_method='primal'``. Marked strict so this flips red when optlang is fixed and +# we should drop the marker. See docs/init_solver_benchmark.md. +_XFAIL = {"hybrid": pytest.mark.xfail( + reason="optlang hybrid_interface.Configuration rejects lp_method='primal' (upstream)", + strict=True, raises=ValueError, +)} + + +def _param(name: str): + marks = [_XFAIL[name]] if name in _XFAIL else [] + return pytest.param(name, marks=marks, id=name) + + +@pytest.fixture(params=[_param(n) for n in _AVAILABLE]) +def solver(request): + """One installed MILP solver per parameter value.""" + return request.param + + +# ----------------------------------------------------------------------- toy fixtures + +def _met(mid, comp="c"): + return cobra.Metabolite(mid, name=mid.split("_")[0], compartment=comp) + + +def _toy_init_model() -> cobra.Model: + """EX_A → A → B → C → D (r1, r2 good; r3 bad). 
Same network as test_init.py.""" + def rxn(rid, lb, ub, mets): + r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub) + r.add_metabolites(mets) + return r + m = cobra.Model("toy") + A, B, C, D = (_met(x) for x in ("A_c", "B_c", "C_c", "D_c")) + m.add_metabolites([A, B, C, D]) + m.add_reactions([rxn("EX_A", -1000, 1000, {A: -1}), + rxn("r1", 0, 1000, {A: -1, B: 1}), + rxn("r2", 0, 1000, {B: -1, C: 1}), + rxn("r3", 0, 1000, {C: -1, D: 1})]) + return m + + +def _toy_ftinit_model() -> cobra.Model: + """Small flux-consistent network for ftINIT: A→B, B→C, parallel A→C (negative-score).""" + def rxn(rid, lb, ub, mets): + r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub) + r.add_metabolites(mets) + return r + m = cobra.Model("ftoy") + A, B, C = (_met(x) for x in ("A_c", "B_c", "C_c")) + m.add_metabolites([A, B, C]) + m.add_reactions([rxn("EX_A", -1000, 0, {A: -1}), + rxn("EX_C", 0, 1000, {C: -1}), + rxn("r1", 0, 1000, {A: -1, B: 1}), + rxn("r2", 0, 1000, {B: -1, C: 1}), + rxn("rbad", 0, 1000, {A: -1, C: 1})]) + return m + + +# --------------------------------------------------------------------- tests + +def test_run_init_same_verdict(solver): + """tINIT MILP on a small network drops the negative-score reaction with any solver.""" + m = _toy_init_model() + m.solver = solver + res = run_init(m, {"r1": 1.0, "r2": 1.0, "r3": -1.0}, prod_weight=0.0, allow_excretion=True) + assert "r3" in res.deleted_reactions + assert sorted(set(r.id for r in res.model.reactions)) == ["EX_A", "r1", "r2"] + + +def test_run_ftinit_same_verdict(solver): + """ftINIT MILP picks the same on-set across solvers on a small network.""" + m = _toy_ftinit_model() + m.solver = solver + res = run_ftinit(m, {"r1": 1.0, "r2": 1.0, "rbad": -1.0}, allow_excretion=True) + assert "rbad" not in res.on_reactions + assert {"r1", "r2"}.issubset(res.on_reactions) + + +def test_check_tasks_works_per_solver(solver): + """check_tasks (one slim_optimize per task) works with each solver.""" + m = 
_toy_ftinit_model() + m.solver = solver + task = Task(id="make_c", inputs=[("A[c]", 0.0, 1000.0)], outputs=[("C[c]", 1.0, 1.0)]) + results = check_tasks(m, [task]) + assert results[0].passed + + +def test_ftinit_pipeline_with_tasks(solver): + """The full ftinit() pipeline (prep + staged MILP + gap-fill) runs with each solver.""" + m = _toy_ftinit_model() + m.solver = solver + task = Task(id="make_c", inputs=[("A[c]", 0.0, 1000.0)], outputs=[("C[c]", 1.0, 1.0)]) + prep = prep_init_model(m, [task]) + out = ftinit(prep, {"r1": 1.0, "r2": 1.0, "rbad": -1.0}, series="1+1") + # Functional: the target task remains satisfiable in the extracted model. + assert check_tasks(out, [task])[0].passed + + +def test_solver_param_plumbing(solver): + """mip_gap / time_limit reach the solver without raising (graceful per backend).""" + m = _toy_ftinit_model() + m.solver = solver + # Tight time limit + loose gap on a trivial problem; just verify the call returns. + res = run_ftinit(m, {"r1": 1.0, "rbad": -1.0}, allow_excretion=True, + mip_gap=0.05, time_limit=60) + assert res.objective is not None diff --git a/tests/test_init_taskfill.py b/tests/test_init_taskfill.py new file mode 100644 index 0000000..c975f41 --- /dev/null +++ b/tests/test_init_taskfill.py @@ -0,0 +1,83 @@ +"""Phase 4d.4: task gap-filling (fill_tasks). + +Oracle: RAVEN tinitTests T0003. Remove the exchange reactions and create a gap by +deleting R7 (e[c] -> e[s]); gap-filling against the full reference must add R7 back so +the task 'make e[s] from a[s]' becomes feasible again. 
+""" +from tinit_oracles import make_test_model, make_test_task + +from raven_python.init import TaskFillResult, fill_tasks + + +def _reference_without_exchanges(): + """testModel with the exchange reactions (R1, R8) removed — the gap-fill template.""" + ref = make_test_model() + ref.remove_reactions(["R1", "R8"], remove_orphans=False) + return ref + + +def test_fills_the_gap_with_r7(): + ref = _reference_without_exchanges() + gapped = ref.copy() + gapped.remove_reactions(["R7"], remove_orphans=False) # the gap + res = fill_tasks(gapped, ref, [make_test_task()]) + assert isinstance(res, TaskFillResult) + assert res.added_reactions == ["R7"] + assert "R7" in {r.id for r in res.model.reactions} + assert not res.failed_tasks + + +def test_no_fill_when_already_feasible(): + """A model that can already do the task gets no additions.""" + ref = _reference_without_exchanges() + res = fill_tasks(ref.copy(), ref, [make_test_task()]) + assert res.added_reactions == [] + + +def test_should_fail_tasks_ignored(): + from raven_python.tasks import Task + + ref = _reference_without_exchanges() + gapped = ref.copy() + gapped.remove_reactions(["R7"], remove_orphans=False) + sf = Task(id="sf", should_fail=True, outputs=[("e[s]", 1.0, 1.0)]) + res = fill_tasks(gapped, ref, [sf]) + assert res.added_reactions == [] # should_fail task drives no gap-filling + + +def test_open_exchange_does_not_short_circuit_gapfill(): + """Boundaries are closed during gap-filling, so an open exchange can't fake feasibility. + + Give the gapped model an open exchange on e[s]; without closing boundaries the task + 'produce e[s]' would look feasible (free secretion) and R7 would never be added. 
+ """ + import cobra + + ref = _reference_without_exchanges() + gapped = ref.copy() + gapped.remove_reactions(["R7"], remove_orphans=False) + ex_es = cobra.Reaction("EX_es", lower_bound=-1000, upper_bound=1000) + ex_es.add_metabolites({gapped.metabolites.es: -1}) + gapped.add_reactions([ex_es]) # open exchange that must be ignored + res = fill_tasks(gapped, ref, [make_test_task()]) + assert "R7" in res.added_reactions # gap still detected and filled + + +def test_prefers_cheaper_reactions_by_score(): + """When two candidates can fill a gap, the higher-scored (cheaper) one is chosen. + + Build a gap that R7 (e[c]->e[s]) OR an alternative ALT (e[c]->e[s]) can fill; give + ALT a much better score so it is preferred. + """ + import cobra + + ref = _reference_without_exchanges() + alt = cobra.Reaction("ALT", lower_bound=0, upper_bound=1000) + alt.add_metabolites({ref.metabolites.ec: -1, ref.metabolites.es: 1}) # same as R7 + alt.gene_reaction_rule = "gALT" + ref.add_reactions([alt]) + gapped = ref.copy() + gapped.remove_reactions(["R7", "ALT"], remove_orphans=False) + # ALT scored high (cost low), R7 scored low (cost high) → ALT chosen. 
+ res = fill_tasks(gapped, ref, [make_test_task()], rxn_scores={"ALT": 5.0, "R7": -3.0}) + assert res.added_reactions == ["ALT"] diff --git a/tests/test_io_excel.py b/tests/test_io_excel.py new file mode 100644 index 0000000..12434ef --- /dev/null +++ b/tests/test_io_excel.py @@ -0,0 +1,111 @@ +"""Tests for raven_python.io.excel (exportToExcelFormat port, export only).""" +import cobra +import pytest + +openpyxl = pytest.importorskip("openpyxl") + +from raven_python.io import export_to_excel +from raven_python.manipulation import add_reactions_from_equations + + +@pytest.fixture +def model(): + m = cobra.Model("yeastGEM") + m.name = "Yeast" + m.compartments = {"c": "cytoplasm"} + m.notes["metaData"] = {"taxonomy": "taxonomy/559292", "defaultLB": "-1000"} + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"), + cobra.Metabolite("adp_c", name="ADP", compartment="c"), + ] + ) + m.metabolites.atp_c.annotation = {"kegg.compound": ["C00002"], "smiles": ["C1=NC"]} + m.metabolites.atp_c.notes = {"inchis": "InChI=1S/X"} + add_reactions_from_equations( + m, + [{"id": "R1", "equation": "atp_c <=> adp_c", "name": "rxn one", + "gene_reaction_rule": "G1", "subsystem": "glycolysis"}], + ) + r = m.reactions.R1 + r.annotation = {"ec-code": ["1.1.1.1"], "kegg.reaction": ["R00001"]} + r.notes = {"confidence_score": 2, "note": "a note", "references": "PMID:1"} + r.objective_coefficient = 1 + return m + + +def _wb(path): + return openpyxl.load_workbook(path) + + +def test_sheets_present(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + wb = _wb(out) + assert set(wb.sheetnames) == {"RXNS", "METS", "COMPS", "GENES", "MODEL"} + + +def test_rxns_sheet(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + ws = _wb(out)["RXNS"] + header = [c.value for c in ws[1]] + row = {header[i]: c.value for i, c in enumerate(ws[2])} + assert row["ID"] == "R1" + assert row["NAME"] == "rxn 
one" + assert "ATP[c]" in row["EQUATION"] and "<=>" in row["EQUATION"] + assert row["EC-NUMBER"] == "1.1.1.1" + assert row["GENE ASSOCIATION"] == "G1" + assert row["SUBSYSTEM"] == "glycolysis" + assert row["OBJECTIVE"] == 1 + assert row["CONFIDENCE SCORE"] == 2 + assert row["NOTE"] == "a note" + assert row["MIRIAM"] == "kegg.reaction/R00001" # ec-code excluded (own column) + + +def test_mets_sheet(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + ws = _wb(out)["METS"] + header = [c.value for c in ws[1]] + rows = { + r[header.index("REPLACEMENT ID")].value: {header[i]: c.value for i, c in enumerate(r)} + for r in ws.iter_rows(min_row=2) + } + atp = rows["atp_c"] + assert atp["ID"] == "ATP[c]" + assert atp["NAME"] == "ATP" + assert atp["InChI"] == "InChI=1S/X" + assert atp["COMPOSITION"] is None # suppressed when InChI present + assert atp["CHARGE"] == -4 + assert atp["MIRIAM"] == "kegg.compound/C00002" # smiles excluded + + +def test_model_sheet(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + ws = _wb(out)["MODEL"] + header = [c.value for c in ws[1]] + row = {header[i]: c.value for i, c in enumerate(ws[2])} + assert row["ID"] == "yeastGEM" + assert row["NAME"] == "Yeast" + assert row["TAXONOMY"] == "taxonomy/559292" + assert row["DEFAULT LOWER"] == "-1000" + + +def test_genes_sheet(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + ws = _wb(out)["GENES"] + header = [c.value for c in ws[1]] + row = {header[i]: c.value for i, c in enumerate(ws[2])} + assert row["NAME"] == "G1" + + +def test_no_genes_skips_sheet(tmp_path): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite("a_c", compartment="c")]) + add_reactions_from_equations(m, [{"id": "R1", "equation": "a_c -->"}]) + out = tmp_path / "m.xlsx" + export_to_excel(m, out) + assert "GENES" not in _wb(out).sheetnames diff --git a/tests/test_io_git.py b/tests/test_io_git.py new file mode 100644 index 0000000..28881dc --- 
/dev/null +++ b/tests/test_io_git.py @@ -0,0 +1,69 @@ +"""Tests for raven_python.io.git (exportForGit port).""" +import cobra +import pytest + +from raven_python.io import export_for_git +from raven_python.manipulation import add_reactions_from_equations + + +@pytest.fixture +def model(): + m = cobra.Model("yeastGEM") + m.compartments = {"c": "cytoplasm"} + m.add_metabolites( + [cobra.Metabolite("atp_c", name="ATP", compartment="c"), + cobra.Metabolite("adp_c", name="ADP", compartment="c")] + ) + add_reactions_from_equations(m, [{"id": "R1", "equation": "atp_c <=> adp_c"}]) + return m + + +def test_standard_gem_layout(model, tmp_path): + root = export_for_git(model, tmp_path, prefix="yeast", formats=("yml", "xml", "mat", "xlsx", "txt")) + assert root == tmp_path / "model" + assert (root / "yml" / "yeast.yml").exists() + assert (root / "xml" / "yeast.xml").exists() + assert (root / "mat" / "yeast.mat").exists() + assert (root / "xlsx" / "yeast.xlsx").exists() + assert (root / "txt" / "yeast.txt").exists() + assert (root / "dependencies.txt").exists() + + +def test_dependencies_file(model, tmp_path): + root = export_for_git(model, tmp_path, formats=("yml",)) + deps = (root / "dependencies.txt").read_text() + assert "python\t" in deps + assert "cobra\t" in deps + assert "raven_python\t" in deps + + +def test_flat_layout(model, tmp_path): + root = export_for_git(model, tmp_path, formats=("yml",), sub_dirs=False) + assert root == tmp_path + assert (tmp_path / "model.yml").exists() + + +def test_subset_of_formats(model, tmp_path): + root = export_for_git(model, tmp_path, formats=("yml", "xml")) + assert (root / "yml" / "model.yml").exists() + assert not (root / "mat").exists() + assert not (root / "xlsx").exists() + + +def test_does_not_mutate_model(model, tmp_path): + order_before = [r.id for r in model.reactions] + export_for_git(model, tmp_path, formats=("yml",)) + assert [r.id for r in model.reactions] == order_before + + +def test_txt_table_content(model, tmp_path): 
+ root = export_for_git(model, tmp_path, formats=("txt",)) + txt = (root / "txt" / "model.txt").read_text() + assert txt.splitlines()[0].startswith("Rxn name\t") + assert "R1" in txt + assert "ATP[c]" in txt + + +def test_bad_format(model, tmp_path): + with pytest.raises(ValueError, match="Unknown format"): + export_for_git(model, tmp_path, formats=("yml", "json")) diff --git a/tests/test_io_sif.py b/tests/test_io_sif.py new file mode 100644 index 0000000..d50ad98 --- /dev/null +++ b/tests/test_io_sif.py @@ -0,0 +1,82 @@ +"""Tests for raven_python.io.sif (exportModelToSIF port).""" +import cobra +import pytest + +from raven_python.io import export_model_to_sif +from raven_python.manipulation import add_reactions_from_equations + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b"}, + {"id": "R2", "equation": "b --> c"}, + ], + ) + return m + + +def _lines(path): + return [ln.split("\t") for ln in path.read_text().splitlines()] + + +def test_reaction_compound(model, tmp_path): + out = tmp_path / "g.sif" + export_model_to_sif(model, out, "rc") + rows = {r[0]: (r[1], set(r[2:])) for r in _lines(out)} + assert rows["R1"] == ("rc", {"a", "b"}) + assert rows["R2"] == ("rc", {"b", "c"}) + + +def test_reaction_reaction(model, tmp_path): + out = tmp_path / "g.sif" + export_model_to_sif(model, out, "rr") + rows = {r[0]: set(r[2:]) for r in _lines(out)} + # R1 and R2 share metabolite b + assert rows["R1"] == {"R2"} + assert rows["R2"] == {"R1"} + + +def test_compound_compound(model, tmp_path): + out = tmp_path / "g.sif" + export_model_to_sif(model, out, "cc") + rows = {r[0]: set(r[2:]) for r in _lines(out)} + # a is a substrate of R1 (a->b): a links to product b + assert "b" in rows.get("a", set()) + # b is substrate of R2 (b->c): b links to c + assert "c" in rows.get("b", set()) + + +def 
test_custom_labels(model, tmp_path): + out = tmp_path / "g.sif" + export_model_to_sif(model, out, "rc", reaction_labels={"R1": "Reaction1"}) + sources = {r[0] for r in _lines(out)} + assert "Reaction1" in sources + assert "R1" not in sources + + +def test_bad_graph_type(model, tmp_path): + with pytest.raises(ValueError, match="graph_type"): + export_model_to_sif(model, tmp_path / "g.sif", "xx") + + +def test_cc_does_not_mutate_input(model, tmp_path): + n_before = len(model.reactions) + export_model_to_sif(model, tmp_path / "g.sif", "cc") + assert len(model.reactions) == n_before # convert_to_irreversible ran on a copy + + +# --- regression: label-map collision (known_issues.md B4) ------------------ + +def test_collapsing_label_map_warns(model, tmp_path): + """A label map that sends two distinct ids to the same label silently merges + nodes during the target-side dedup. Now warns so the user sees it.""" + with pytest.warns(UserWarning, match="multiple ids to the same label"): + export_model_to_sif( + model, tmp_path / "g.sif", "rc", + reaction_labels={"R1": "shared", "R2": "shared"}, + ) diff --git a/tests/test_io_yaml.py b/tests/test_io_yaml.py new file mode 100644 index 0000000..510af5f --- /dev/null +++ b/tests/test_io_yaml.py @@ -0,0 +1,202 @@ +"""Tests for raven_python.io.yaml against the RAVEN fa281a1 (cobra-native !!omap) schema.""" +from pathlib import Path + +import cobra +import pytest +from cobra.io.yaml import yaml as cobra_yaml + +from raven_python.io import read_yaml_model, write_yaml_model + +# A model laid out exactly as RAVEN writeYAMLmodel (fa281a1) emits: cobra-native +# structure, RAVEN-only fields as top-level per-entry keys, smiles/ec-code inside +# the annotation block, metaData provenance-only, id/name/version top-level. 
+RAVEN_DOC = { + "metabolites": [ + { + "id": "s_0001", + "name": "ATP", + "compartment": "c", + "formula": "C10H16N5O13P3", + "charge": -4, + "inchis": "InChI=1S/CH4", + "deltaG": 12.5, + "notes": "a metabolite note", + "metFrom": "KEGG", + "annotation": {"kegg.compound": ["C00002"], "smiles": ["C1=NC2"]}, + }, + {"id": "s_0002", "name": "ADP", "compartment": "c"}, + ], + "reactions": [ + { + "id": "R1", + "name": "rxn one", + "metabolites": {"s_0001": -1, "s_0002": 1}, + "lower_bound": -1000.0, + "upper_bound": 1000.0, + "gene_reaction_rule": "G1", + "subsystem": "glycolysis", + "confidence_score": 2, + "references": "PMID:123", + "rxnFrom": "manual", + "notes": "a reaction note", + "deltaG": -5.0, + "annotation": {"ec-code": ["1.1.1.1"]}, + } + ], + "genes": [ + {"id": "G1", "name": "gene one", "protein": "P12345", "annotation": {"uniprot": ["P12345"]}} + ], + "id": "testModel", + "name": "Test Model", + "compartments": {"c": "cytoplasm"}, + "version": "1.0", + "metaData": {"date": "2026-05-23", "taxonomy": "taxonomy/559292", "defaultLB": "-1000"}, + "ec-rxns": [{"id": "R1", "kcat": 100.0}], +} + + +@pytest.fixture +def yaml_file(tmp_path) -> Path: + p = tmp_path / "model.yml" + with open(p, "w", encoding="utf-8") as fh: + cobra_yaml.dump(RAVEN_DOC, fh) + return p + + +def test_standard_content(yaml_file): + model = read_yaml_model(yaml_file) + assert model.id == "testModel" + assert model.name == "Test Model" + assert {m.id for m in model.metabolites} == {"s_0001", "s_0002"} + r = model.reactions.get_by_id("R1") + assert r.bounds == (-1000.0, 1000.0) + assert r.subsystem == "glycolysis" + assert r.gene_reaction_rule == "G1" + + +def test_annotation_owned_by_cobra(yaml_file): + # smiles / ec-code / miriam live in the annotation block (cobra reads them) + model = read_yaml_model(yaml_file) + assert model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"] + assert model.metabolites.get_by_id("s_0001").annotation["kegg.compound"] == ["C00002"] + 
assert model.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"] + assert model.genes.get_by_id("G1").annotation["uniprot"] == ["P12345"] + + +def test_raven_only_fields_captured(yaml_file): + model = read_yaml_model(yaml_file) + a = model.metabolites.get_by_id("s_0001") + assert a.notes["inchis"] == "InChI=1S/CH4" + assert a.notes["deltaG"] == 12.5 + assert a.notes["note"] == "a metabolite note" # RAVEN metNotes string, no crash + assert a.notes["metFrom"] == "KEGG" + assert "smiles" not in a.notes # smiles stays in annotation + r = model.reactions.get_by_id("R1") + assert r.notes["confidence_score"] == 2 + assert r.notes["references"] == "PMID:123" + assert r.notes["rxnFrom"] == "manual" + assert r.notes["note"] == "a reaction note" + assert r.notes["deltaG"] == -5.0 + assert model.genes.get_by_id("G1").notes["protein"] == "P12345" + + +def test_model_level_extras(yaml_file): + model = read_yaml_model(yaml_file) + assert model.notes["metaData"]["taxonomy"] == "taxonomy/559292" + assert model.notes["version"] == "1.0" + assert model.notes["_yaml_sections"]["ec-rxns"][0]["kcat"] == 100.0 + + +def test_round_trip(yaml_file, tmp_path): + model = read_yaml_model(yaml_file) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + reloaded = read_yaml_model(out) + + assert reloaded.id == "testModel" + assert reloaded.notes["version"] == "1.0" + assert reloaded.notes["metaData"]["taxonomy"] == "taxonomy/559292" + a = reloaded.metabolites.get_by_id("s_0001") + assert a.notes["deltaG"] == 12.5 + assert a.notes["note"] == "a metabolite note" + assert a.annotation["smiles"] == ["C1=NC2"] + r = reloaded.reactions.get_by_id("R1") + assert r.notes["confidence_score"] == 2 + assert reloaded.genes.get_by_id("G1").notes["protein"] == "P12345" + assert reloaded.notes["_yaml_sections"]["ec-rxns"][0]["id"] == "R1" + + +def test_extra_notes_not_dropped_when_free_text_note_present(yaml_file, tmp_path): + """An entry with both a RAVEN free-text note and an extra note 
keeps both on write.""" + model = read_yaml_model(yaml_file) + a = model.metabolites.get_by_id("s_0001") + a.notes["note"] = "free text" + a.notes["custom"] = "extra value" # a non-RAVEN note that must not be silently lost + out = tmp_path / "out.yml" + write_yaml_model(model, out) + text = out.read_text() + assert "extra value" in text # the leftover note survives serialization + + +def test_gzipped_round_trip(yaml_file, tmp_path): + # A .yml.gz path is transparently gzipped on write and read. + model = read_yaml_model(yaml_file) + out = tmp_path / "out.yml.gz" + write_yaml_model(model, out) + assert out.read_bytes()[:2] == b"\x1f\x8b" # gzip magic + reloaded = read_yaml_model(out) + assert reloaded.id == "testModel" + assert {m.id for m in reloaded.metabolites} == {"s_0001", "s_0002"} + + +def test_output_is_cobra_readable(yaml_file, tmp_path): + # The written file must load with stock cobra (it's cobra's native format). + model = read_yaml_model(yaml_file) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + cobra_model = cobra.io.load_yaml_model(str(out)) + assert cobra_model.id == "testModel" + assert {m.id for m in cobra_model.metabolites} == {"s_0001", "s_0002"} + # RAVEN-only fields land in cobra notes; smiles in annotation + assert cobra_model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"] + + +def test_write_emits_raven_top_level_keys(yaml_file, tmp_path): + model = read_yaml_model(yaml_file) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + text = out.read_text() + # RAVEN-only fields are lifted back to top-level entry keys, not buried in notes + assert "inchis:" in text + assert "deltaG:" in text + assert "confidence_score:" in text + assert "metaData:" in text + + +def test_legacy_id_in_metadata(tmp_path): + # Older RAVEN files nest id/name under metaData and have no top-level id. 
+ legacy = { + "metabolites": [{"id": "a_c", "name": "A", "compartment": "c"}], + "reactions": [], + "genes": [], + "compartments": {"c": "cyt"}, + "metaData": {"id": "legacyModel", "name": "Legacy"}, + } + p = tmp_path / "legacy.yml" + with open(p, "w", encoding="utf-8") as fh: + cobra_yaml.dump(legacy, fh) + model = read_yaml_model(p) + assert model.id == "legacyModel" + assert model.name == "Legacy" + + +# Optional smoke test against a real model file if present. +_YEAST = Path("/home/eduardk/github/GECKO/tutorials/full_ecModel/models/yeast-GEM.yml") + + +@pytest.mark.skipif(not _YEAST.exists(), reason="real yeast-GEM.yml not available") +def test_real_yeast_gem_loads(): + model = read_yaml_model(_YEAST) + assert len(model.reactions) > 1000 + # legacy file: identity comes from metaData + assert model.id diff --git a/tests/test_localization.py b/tests/test_localization.py new file mode 100644 index 0000000..d177db5 --- /dev/null +++ b/tests/test_localization.py @@ -0,0 +1,227 @@ +"""Tests for raven_python.localization — predictor loaders + the MILP + apply (Phase 7).""" +from __future__ import annotations + +from textwrap import dedent + +import cobra +import pandas as pd +import pytest + +from raven_python.localization import ( + LocalizationProposal, + LocalizationResult, + LocalizationScores, + apply_localization, + load_deeploc, + load_wolfpsort, + predict_localization, +) + +# --------------------------------------------------------------------- loaders + +def test_load_wolfpsort_basic(tmp_path): + p = tmp_path / "wolf.txt" + p.write_text(dedent("""\ + # header comment + Gene1 cyto 13, nucl 7, mito 4 + Gene2: treating 9 X's as Glycines + Gene3 mito 20, cyto 2 + """)) + s = load_wolfpsort(p) + assert "Gene1" in s.genes + assert "Gene2" not in s.genes # the 'treating' line is skipped + assert "Gene3" in s.genes + # row-normalised to max=1: + assert s.df.loc["Gene1", "cyto"] == pytest.approx(1.0) # 13/13 + assert s.df.loc["Gene1", "nucl"] == pytest.approx(7 / 
13) + assert s.df.loc["Gene3", "mito"] == pytest.approx(1.0) + assert s.df.loc["Gene3", "cyto"] == pytest.approx(0.1) + + +def test_load_deeploc_csv(tmp_path): + p = tmp_path / "deeploc.csv" + p.write_text(dedent("""\ + Protein_ID,Localizations,Signals,Cytoplasm,Nucleus,Mitochondrion + G1,Cytoplasm,,0.8,0.1,0.05 + G2,Mitochondrion,SP,0.05,0.15,0.9 + """)) + s = load_deeploc(p) + assert set(s.compartments) == {"Cytoplasm", "Nucleus", "Mitochondrion"} + # row-max → 1.0 + assert s.df.loc["G1", "Cytoplasm"] == pytest.approx(1.0) + assert s.df.loc["G2", "Mitochondrion"] == pytest.approx(1.0) + + +def test_localization_scores_with_compartments_rename(): + df = pd.DataFrame({"cyto": [1.0], "mito": [0.2]}, index=pd.Index(["g1"], name="gene_id")) + s = LocalizationScores(df).with_compartments({"cyto": "c", "mito": "m"}) + assert list(s.compartments) == ["c", "m"] + + +# ----------------------------------------------------------------- predict (toy) + +def _toy_two_compartment_model() -> cobra.Model: + """Single-compartment draft (everything in 'c'): + + A_c -(r1)-> B_c -(r2)-> C_c (r2 should move to 'm' per scores below) + Boundary EX_A imports A; EX_C drains C. 
+ """ + m = cobra.Model("toy") + A, B, C = (cobra.Metabolite(x + "_c", name=x, compartment="c") for x in "ABC") + m.add_metabolites([A, B, C]) + + def rxn(rid, lb, ub, mets, gpr=None): + r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub) + r.add_metabolites(mets) + if gpr: + r.gene_reaction_rule = gpr + return r + m.add_reactions([rxn("EX_A", -1000, 0, {A: -1}), + rxn("EX_C", 0, 1000, {C: -1}), + rxn("r1", 0, 1000, {A: -1, B: 1}, "g1"), + rxn("r2", 0, 1000, {B: -1, C: 1}, "g2")]) + return m + + +def test_predict_empty_relocate_set_is_no_op(): + """An empty relocate set short-circuits to an empty proposal.""" + m = _toy_two_compartment_model() + scores = LocalizationScores(pd.DataFrame( + {"c": [1.0, 0.0], "m": [0.0, 1.0]}, index=pd.Index(["g1", "g2"], name="gene_id"))) + res = predict_localization(m, scores, reactions_to_relocate=[], apply=False) + assert isinstance(res, LocalizationProposal) + assert res.moved.empty + + +def test_predict_places_single_reaction(): + """Pass r2 in the relocate set; it goes to 'm' per scores.""" + m = _toy_two_compartment_model() + scores = LocalizationScores(pd.DataFrame( + {"c": [1.0, 0.0], "m": [0.0, 1.0]}, index=pd.Index(["g1", "g2"], name="gene_id"))) + res = predict_localization(m, scores, ["r2"], default_compartment="c", apply=False, + transport_cost=0.1) + assert isinstance(res, LocalizationProposal) + assert set(res.moved["rxn_id"]) == {"r2"} + assert res.moved.iloc[0]["to_compartment"] == "m" + + +def test_predict_apply_creates_compartment_metabolites_and_transports(): + """apply=True should mutate the (copy) model: r2 in m, and B/C transports added.""" + m = _toy_two_compartment_model() + scores = LocalizationScores(pd.DataFrame( + {"c": [1.0, 0.0], "m": [0.0, 1.0]}, index=pd.Index(["g1", "g2"], name="gene_id"))) + res = predict_localization(m, scores, ["r2"], default_compartment="c", apply=True, + transport_cost=0.05) + assert isinstance(res, LocalizationResult) + out = res.model + r2 = out.reactions.r2 + assert 
{mt.compartment for mt in r2.metabolites} == {"m"} # both substrates now in m + # B_m and C_m metabolite copies must exist: + assert "B_m" in out.metabolites and "C_m" in out.metabolites + # Transports tr_B_m and tr_C_m must be added (default c ↔ m): + transport_ids = {t.id for t in res.added_transports} + assert "tr_B_m" in transport_ids + assert "tr_C_m" in transport_ids + # Original model untouched (we copied). + assert m.reactions.r2.metabolites != r2.metabolites + assert "B_m" not in m.metabolites + + +def test_predict_unplaced_reaction_when_no_scored_gene(): + """A relocate-set reaction whose genes are all absent from scores is reported, not crashed.""" + m = _toy_two_compartment_model() + # Only g1 has scores; g2 (r2's gene) is absent → r2 is unplaceable. + scores = LocalizationScores(pd.DataFrame( + {"c": [1.0], "m": [0.0]}, index=pd.Index(["g1"], name="gene_id"))) + res = predict_localization(m, scores, ["r2"], apply=False) + assert isinstance(res, LocalizationProposal) + assert "r2" in res.unplaced_reactions + assert res.moved.empty # nothing actually placed + + +def test_predict_boundary_reactions_always_pinned(): + """Boundary reactions in the relocate set are silently filtered out.""" + m = _toy_two_compartment_model() + scores = LocalizationScores(pd.DataFrame( + {"c": [1.0, 0.0], "m": [0.0, 1.0]}, index=pd.Index(["g1", "g2"], name="gene_id"))) + res = predict_localization(m, scores, ["EX_A", "EX_C", "r2"], apply=False, + transport_cost=0.1) + # Only r2 should appear in the proposal — boundaries dropped. 
+ assert set(res.moved["rxn_id"]) == {"r2"} + + +def test_predict_default_compartment_validated(): + m = _toy_two_compartment_model() + scores = LocalizationScores(pd.DataFrame( + {"c": [1.0], "m": [0.0]}, index=pd.Index(["g2"], name="gene_id"))) + with pytest.raises(ValueError, match="default_compartment"): + predict_localization(m, scores, ["r2"], default_compartment="x", apply=False) + + +def test_apply_localization_idempotent_on_empty_proposal(): + """An empty proposal (no moves, no transports) shouldn't change the model.""" + m = _toy_two_compartment_model() + empty = LocalizationProposal( + moved=pd.DataFrame(columns=["rxn_id", "from_compartment", "to_compartment"]), + added_transports=pd.DataFrame(columns=["met_id", "compartment"]), + gene_compartments={}) + out, added = apply_localization(m, empty) + assert len(out.reactions) == len(m.reactions) + assert added == [] + + +# ----------------------------------------- multi-compartment scoring (NEW) + +def test_predict_multi_compartment_when_secondary_score_beats_penalty(): + """Dual-localised gene: secondary compartment score 0.8 > penalty 0.3 → gene lands in + both compartments. Two reactions sharing one gene each placed in their best + compartment without contradicting the gene assignment.""" + m = cobra.Model("dual") + A_c = cobra.Metabolite("A_c", compartment="c") + B_c = cobra.Metabolite("B_c", compartment="c") + m.add_metabolites([A_c, B_c]) + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r1.add_metabolites({A_c: -1, B_c: 1}) + r1.gene_reaction_rule = "g_dual" + r2 = cobra.Reaction("r2", lower_bound=0, upper_bound=1000) + r2.add_metabolites({A_c: -1, B_c: 1}) + r2.gene_reaction_rule = "g_dual" + m.add_reactions([r1, r2]) + # g_dual scores: c=1.0 (primary), m=0.8 (secondary). Penalty 0.3 — secondary worth it. 
+ scores = LocalizationScores(pd.DataFrame( + {"c": [1.0], "m": [0.8]}, index=pd.Index(["g_dual"], name="gene_id"))) + res = predict_localization(m, scores, ["r2"], default_compartment="c", apply=False, + transport_cost=0.0, multi_compartment_penalty=0.3) + # The gene should land in BOTH c and m (primary free + 0.8 - 0.3 > 0 for secondary). + assert set(res.gene_compartments["g_dual"]) == {"c", "m"} + + +def test_predict_mono_when_secondary_score_below_penalty(): + """Same as above but penalty 0.9 > secondary score 0.8 → gene mono-localises (c only).""" + m = cobra.Model("mono") + A_c = cobra.Metabolite("A_c", compartment="c") + B_c = cobra.Metabolite("B_c", compartment="c") + m.add_metabolites([A_c, B_c]) + r2 = cobra.Reaction("r2", lower_bound=0, upper_bound=1000) + r2.add_metabolites({A_c: -1, B_c: 1}) + r2.gene_reaction_rule = "g_dual" + m.add_reactions([r2]) + scores = LocalizationScores(pd.DataFrame( + {"c": [1.0], "m": [0.8]}, index=pd.Index(["g_dual"], name="gene_id"))) + res = predict_localization(m, scores, ["r2"], default_compartment="c", apply=False, + transport_cost=0.0, multi_compartment_penalty=0.9) + # Penalty exceeds secondary score → only the primary compartment. + assert res.gene_compartments["g_dual"] == ["c"] + + +def test_predict_high_penalty_forces_mono_localisation(): + """Very high penalty effectively bans extra compartments.""" + m = _toy_two_compartment_model() + scores = LocalizationScores(pd.DataFrame( + {"c": [0.4, 0.3], "m": [0.5, 0.6]}, + index=pd.Index(["g1", "g2"], name="gene_id"))) + res = predict_localization(m, scores, ["r1", "r2"], default_compartment="c", apply=False, + transport_cost=0.0, multi_compartment_penalty=1000.0) + # With penalty so high, every gene gets exactly one compartment. 
+ for g, comps in res.gene_compartments.items(): + assert len(comps) == 1, f"{g} landed in {comps} with prohibitive penalty" diff --git a/tests/test_manipulation_add.py b/tests/test_manipulation_add.py new file mode 100644 index 0000000..2a3a9d3 --- /dev/null +++ b/tests/test_manipulation_add.py @@ -0,0 +1,278 @@ +"""Tests for raven_python.manipulation.add (addRxns port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils.parse import parse_name_comp + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", compartment="c"), + cobra.Metabolite("h2o_c", name="H2O", compartment="c"), + cobra.Metabolite("adp_c", name="ADP", compartment="c"), + cobra.Metabolite("pi_c", name="phosphate", compartment="c"), + ] + ) + return m + + +# --- parse_name_comp ------------------------------------------------------- + +@pytest.mark.parametrize( + "token,expected", + [ + ("ATP[c]", ("ATP", "c")), + ("ATP", ("ATP", None)), + (" ATP[c] ", ("ATP", "c")), + ("weird[name][m]", ("weird[name]", "m")), + ], +) +def test_parse_name_comp(token, expected): + assert parse_name_comp(token) == expected + + +# --- id mode (eqnType 1) --------------------------------------------------- + +def test_add_by_id_basic_and_reversibility(model): + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c + h2o_c <=> adp_c + pi_c"}] + ) + assert rxn.id == "R1" + assert rxn.reversibility is True + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "atp_c": -1.0, + "h2o_c": -1.0, + "adp_c": 1.0, + "pi_c": 1.0, + } + + +def test_irreversible_arrows(model): + rxns = add_reactions_from_equations( + model, + [ + {"id": "R1", "equation": "atp_c --> adp_c"}, + {"id": "R2", "equation": "atp_c => adp_c"}, + ], + ) + for r in rxns: + assert r.lower_bound == 0.0 + assert r.reversibility is False + + +def test_coefficients(model): + 
(rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "2 atp_c + 1.5 h2o_c --> adp_c"}] + ) + assert rxn.get_coefficient("atp_c") == -2.0 + assert rxn.get_coefficient("h2o_c") == -1.5 + + +def test_id_mode_creates_new_met_in_compartment(model): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="c", + ) + assert "amp_c" in model.metabolites + assert model.metabolites.get_by_id("amp_c").compartment == "c" + + +def test_id_mode_new_met_without_compartment_errors(model): + with pytest.raises(ValueError, match="no compartment"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> amp_c"}]) + + +# --- name mode (eqnType 2) ------------------------------------------------- + +def test_name_mode_matches_existing_by_name(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP + H2O <=> ADP + phosphate"}], + mets_by="name", + compartment="c", + ) + # resolved to the existing _c metabolites, not new ones + assert {m.id for m in rxn.metabolites} == {"atp_c", "h2o_c", "adp_c", "pi_c"} + assert len(model.metabolites) == 4 + + +def test_name_mode_creates_new_met_with_auto_id(model): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP --> AMP"}], + mets_by="name", + compartment="c", + ) + new = [m for m in model.metabolites if m.name == "AMP"] + assert len(new) == 1 + assert new[0].id == "m1" + assert new[0].compartment == "c" + + +def test_name_mode_requires_compartment(model): + with pytest.raises(ValueError, match="needs a compartment"): + add_reactions_from_equations( + model, [{"id": "R1", "equation": "ATP --> ADP"}], mets_by="name" + ) + + +# --- name[comp] mode (eqnType 3) ------------------------------------------- + +def test_name_comp_syntax(model): + model.add_metabolites([cobra.Metabolite("atp_m", name="ATP", compartment="m")]) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP[c] --> 
ATP[m]"}], + mets_by="name", + compartment="c", + ) + # matched ATP in two different compartments by name[comp] + assert {m.id for m in rxn.metabolites} == {"atp_c", "atp_m"} + + +# --- genes ----------------------------------------------------------------- + +def test_gene_rule_auto_creates_genes(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}], + ) + assert {g.id for g in rxn.genes} == {"G1", "G2"} + assert {g.id for g in model.genes} == {"G1", "G2"} + + +def test_strict_genes_errors_on_unknown(model): + with pytest.raises(ValueError, match="genes not in the model"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}], + allow_new_genes=False, + ) + + +def test_strict_genes_ok_when_present(model): + model.genes.append(cobra.core.gene.Gene("G1")) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}], + allow_new_genes=False, + ) + assert rxn.gene_reaction_rule == "G1" + + +# --- guards & extras ------------------------------------------------------- + +def test_duplicate_reaction_id_errors(model): + model.add_reactions([cobra.Reaction("R1")]) + with pytest.raises(ValueError, match="already exists"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> adp_c"}]) + + +def test_strict_mets_errors(model): + with pytest.raises(ValueError, match="allow_new_mets"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="c", + allow_new_mets=False, + ) + + +def test_explicit_bounds_override_arrow(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c <=> adp_c", "bounds": (0, 50), "name": "myrxn"}], + ) + assert rxn.bounds == (0, 50) + assert rxn.name == "myrxn" + + +def test_net_zero_metabolite_dropped(model): + # atp_c on both sides nets 
to zero and is removed. + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c + h2o_c --> atp_c + adp_c"}] + ) + assert "atp_c" not in {m.id for m in rxn.metabolites} + assert {m.id for m in rxn.metabolites} == {"h2o_c", "adp_c"} + + +def test_missing_equation_errors(model): + with pytest.raises(ValueError, match="missing required 'equation'"): + add_reactions_from_equations(model, [{"id": "R1"}]) + + +def test_no_arrow_errors(model): + with pytest.raises(ValueError, match="No reaction arrow"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c + h2o_c"}]) + + +# --- regression: leading-number metabolite name (known_issues.md A1) ------- + +def test_name_mode_preserves_leading_number_name(model): + """A metabolite name that begins with a number isn't misparsed as a coefficient. + + Before the fix the token ``"2 oxoglutarate"`` was parsed as ``(coeff=2, name="oxoglutarate")`` + silently — corrupting the stoichiometry. The resolver now prefers the full + token when it matches an existing metabolite name. 
+ """ + model.add_metabolites([ + cobra.Metabolite("akg_c", name="2 oxoglutarate", compartment="c"), + ]) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP + 2 oxoglutarate --> ADP"}], + mets_by="name", + compartment="c", + ) + assert rxn.get_coefficient("akg_c") == -1.0 # not -2.0 + assert rxn.get_coefficient("atp_c") == -1.0 + + +def test_name_mode_coefficient_still_works_without_collision(model): + """If the full token doesn't match anything, fall back to coefficient split.""" + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "2 ATP + H2O --> ADP + phosphate"}], + mets_by="name", + compartment="c", + ) + assert rxn.get_coefficient("atp_c") == -2.0 + + +# --- regression: empty-stoichiometry warning (known_issues.md A2) ---------- + +def test_empty_stoichiometry_warns(model): + """All-terms-cancel reaction warns instead of silently shipping an empty rxn.""" + with pytest.warns(UserWarning, match="no net metabolites"): + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c --> atp_c"}] + ) + assert len(rxn.metabolites) == 0 + + +# --- regression: unknown-compartment warning (known_issues.md B2) ---------- + +def test_id_mode_unknown_compartment_warns(model): + """A typo'd compartment used to silently produce a one-met ghost compartment + in id mode (the name/[comp] path used to validate, id mode never did).""" + with pytest.warns(UserWarning, match="unregistered compartment 'cyto'"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="cyto", # typo for 'c' + ) + + +def test_name_comp_unknown_compartment_warns(model): + """Same defensive check in the name[comp] path when allow_new_mets=True.""" + with pytest.warns(UserWarning, match="unregistered compartment 'mito'"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP[c] --> AMP[mito]"}], + mets_by="name", + ) diff --git 
a/tests/test_manipulation_change.py b/tests/test_manipulation_change.py new file mode 100644 index 0000000..8d54f58 --- /dev/null +++ b/tests/test_manipulation_change.py @@ -0,0 +1,93 @@ +"""Tests for raven_python.manipulation.change (changeRxns port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, change_reaction_equations + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", name="A", compartment="c"), + cobra.Metabolite("b_c", name="B", compartment="c"), + cobra.Metabolite("c_c", name="C", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + { + "id": "R1", + "equation": "a_c <=> b_c", + "name": "first", + "bounds": (-30, 70), + "gene_reaction_rule": "G1 or G2", + "subsystem": "sub", + }, + {"id": "R2", "equation": "a_c --> c_c"}, + ], + ) + return m + + +def test_changes_stoichiometry(model): + (rxn,) = change_reaction_equations(model, {"R1": "a_c --> 2 c_c"}) + assert rxn.id == "R1" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "a_c": -1.0, + "c_c": 2.0, + } + + +def test_preserves_other_fields(model): + before = model.reactions.get_by_id("R1") + name, bounds, subsystem = before.name, before.bounds, before.subsystem + genes = {g.id for g in before.genes} + + change_reaction_equations(model, {"R1": "a_c --> c_c"}) + + after = model.reactions.get_by_id("R1") + assert after.name == name + assert after.bounds == bounds # bounds untouched, per RAVEN + assert after.subsystem == subsystem + assert {g.id for g in after.genes} == genes + + +def test_preserves_reaction_order(model): + order_before = [r.id for r in model.reactions] + change_reaction_equations(model, {"R1": "b_c --> c_c"}) + assert [r.id for r in model.reactions] == order_before + + +def test_bounds_not_changed_by_arrow(model): + # R1 starts reversible (-30, 70); a --> arrow must NOT make it irreversible. 
+ change_reaction_equations(model, {"R1": "a_c --> b_c"}) + assert model.reactions.get_by_id("R1").bounds == (-30, 70) + + +def test_name_mode(model): + (rxn,) = change_reaction_equations( + model, {"R2": "A --> C"}, mets_by="name", compartment="c" + ) + assert {m.id for m in rxn.metabolites} == {"a_c", "c_c"} + + +def test_can_introduce_new_met(model): + change_reaction_equations( + model, {"R2": "a_c --> d_c"}, compartment="c" + ) + assert "d_c" in model.metabolites + assert model.reactions.get_by_id("R2").get_coefficient("d_c") == 1.0 + + +def test_unknown_reaction_errors(model): + with pytest.raises(ValueError, match="not found"): + change_reaction_equations(model, {"NOPE": "a_c --> b_c"}) + + +def test_multiple_reactions(model): + changed = change_reaction_equations(model, {"R1": "a_c --> c_c", "R2": "b_c --> c_c"}) + assert [r.id for r in changed] == ["R1", "R2"] + assert model.reactions.get_by_id("R2").get_coefficient("b_c") == -1.0 diff --git a/tests/test_manipulation_compartments.py b/tests/test_manipulation_compartments.py new file mode 100644 index 0000000..4d3fb3b --- /dev/null +++ b/tests/test_manipulation_compartments.py @@ -0,0 +1,139 @@ +"""Tests for manipulation/compartments.py — merge_compartments + copy_to_compartment.""" +from __future__ import annotations + +import cobra +import pytest + +from raven_python.manipulation.compartments import copy_to_compartment, merge_compartments + + +def _two_compartment_model() -> cobra.Model: + """A_c → B_c, A_m → B_m, and a transport A_c ↔ A_m. 
Multi-compartment toy.""" + m = cobra.Model("toy") + A_c = cobra.Metabolite("A_c", name="A", compartment="c") + A_m = cobra.Metabolite("A_m", name="A", compartment="m") + B_c = cobra.Metabolite("B_c", name="B", compartment="c") + B_m = cobra.Metabolite("B_m", name="B", compartment="m") + m.add_metabolites([A_c, A_m, B_c, B_m]) + + def rxn(rid, lb, ub, mets, gpr=None): + r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub) + r.add_metabolites(mets) + if gpr: + r.gene_reaction_rule = gpr + return r + m.add_reactions([rxn("r_c", 0, 1000, {A_c: -1, B_c: 1}, "g1"), + rxn("r_m", 0, 1000, {A_m: -1, B_m: 1}, "g2"), + rxn("tr_A", -1000, 1000, {A_c: -1, A_m: 1})]) + return m + + +# ----------------------------------------------------------------- merge_compartments + +def test_merge_compartments_collapses_to_one(): + """A_c + A_m → A; B_c + B_m → B; transport A_c↔A_m self-cancels and is dropped.""" + m = _two_compartment_model() + merged, deleted, dupes = merge_compartments(m) + # Only the base ids survive. + assert {x.id for x in merged.metabolites} == {"A", "B"} + # The transport reaction collapsed (A → A) and was deleted. + assert "tr_A" in deleted + # r_c and r_m are now both A → B; one of them gets deduplicated. + surviving = {r.id for r in merged.reactions} + assert len(surviving & {"r_c", "r_m"}) == 1 + assert (set(dupes) | (surviving & {"r_c", "r_m"})) == {"r_c", "r_m"} + + +def test_merge_compartments_preserves_gpr_and_subsystem(): + m = _two_compartment_model() + m.reactions.r_c.subsystem = "carbo" + merged, _, _ = merge_compartments(m) + survivor = next(r for r in merged.reactions if r.id in {"r_c", "r_m"}) + # The survivor keeps its gene rule + subsystem (cobra may sometimes lose them + # through copy; we set them explicitly). 
+ assert survivor.gene_reaction_rule in {"g1", "g2"} + if survivor.id == "r_c": + assert survivor.subsystem == "carbo" + + +def test_merge_compartments_keeps_single_met_reactions_when_asked(): + """drop_single_metabolite_reactions=False keeps the collapsed transport (now A → A, + which is empty stoichiometry after net-cancellation — still dropped, but the *one-met* + case is the more interesting one). Use a uniport pattern to exercise it.""" + m = cobra.Model("uniport") + A_c = cobra.Metabolite("A_c", name="A", compartment="c") + A_m = cobra.Metabolite("A_m", name="A", compartment="m") + H_c = cobra.Metabolite("H_c", name="H", compartment="c") + m.add_metabolites([A_c, A_m, H_c]) + # H+ symport: A_c + H_c → A_m. After merge: A + H → A → leaves H. + sym = cobra.Reaction("sym", lower_bound=0, upper_bound=1000) + sym.add_metabolites({A_c: -1, H_c: -1, A_m: 1}) + m.add_reactions([sym]) + merged_drop, deleted_drop, _ = merge_compartments(m, drop_single_metabolite_reactions=True) + assert "sym" in deleted_drop + merged_keep, deleted_keep, _ = merge_compartments(m, drop_single_metabolite_reactions=False) + # With keep, sym survives as a one-met reaction (consumes H). 
+ assert "sym" not in deleted_keep + assert "sym" in {r.id for r in merged_keep.reactions} + + +def test_merge_compartments_deduplicate_off_keeps_both(): + m = _two_compartment_model() + merged, _, dupes = merge_compartments(m, deduplicate_reactions=False) + assert dupes == [] + assert {"r_c", "r_m"} <= {r.id for r in merged.reactions} + + +# ----------------------------------------------------------------- copy_to_compartment + +def test_copy_to_compartment_basic(): + """Copy r_c into 'p' (peroxisome): a new reaction r_c_p with metabolites in p.""" + m = _two_compartment_model() + out, new_rxns, new_mets = copy_to_compartment(m, ["r_c"], "p", + target_compartment_name="peroxisome") + assert "r_c_p" in [r.id for r in out.reactions] + new_r = out.reactions.r_c_p + assert {x.compartment for x in new_r.metabolites} == {"p"} + assert "A_p" in [x.id for x in out.metabolites] + assert "B_p" in [x.id for x in out.metabolites] + assert new_rxns == ["r_c_p"] + assert set(new_mets) == {"A_p", "B_p"} + # Original still there. 
+ assert "r_c" in [r.id for r in out.reactions] + + +def test_copy_to_compartment_preserves_gpr_and_bounds(): + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p") + new_r = out.reactions.r_c_p + assert new_r.gene_reaction_rule == "g1" + assert new_r.lower_bound == 0 and new_r.upper_bound == 1000 + + +def test_copy_to_compartment_delete_original_is_a_move(): + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p", delete_original=True) + assert "r_c" not in [r.id for r in out.reactions] + assert "r_c_p" in [r.id for r in out.reactions] + + +def test_copy_to_compartment_idempotent(): + """Calling twice doesn't add the reaction twice.""" + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p") + out2, new_rxns, _ = copy_to_compartment(out, ["r_c"], "p") + assert new_rxns == [] # nothing added on second call + assert len([r for r in out2.reactions if r.id == "r_c_p"]) == 1 + + +def test_copy_to_compartment_unknown_reaction_raises(): + m = _two_compartment_model() + with pytest.raises(ValueError, match="not in model"): + copy_to_compartment(m, ["does_not_exist"], "p") + + +def test_copy_to_compartment_custom_suffix(): + m = _two_compartment_model() + out, new_rxns, _ = copy_to_compartment(m, ["r_c"], "p", id_suffix="copy1") + assert new_rxns == ["r_c_copy1"] + assert "A_copy1" in [x.id for x in out.metabolites] diff --git a/tests/test_manipulation_expand.py b/tests/test_manipulation_expand.py new file mode 100644 index 0000000..08cd2f2 --- /dev/null +++ b/tests/test_manipulation_expand.py @@ -0,0 +1,288 @@ +"""Tests for expand_model (RAVEN expandModel.m) — splitting isozymes into reactions. + +Adopted from geckopy's tests/test_expand.py. 
+""" +import cobra + +from raven_python.manipulation import expand_model +from raven_python.manipulation.expand import _gpr_to_dnf + +# --------------------------------------------------------------------------- # +# DNF conversion (internal helper, worth testing directly) +# --------------------------------------------------------------------------- # + +def _dnf_from_gpr_string(gpr_str: str) -> list[list[str]]: + from cobra.core.gene import GPR + + gpr = GPR.from_string(gpr_str) + return _gpr_to_dnf(gpr) + + +def test_dnf_empty_gpr(): + assert _dnf_from_gpr_string("") == [] + + +def test_dnf_single_gene(): + assert _dnf_from_gpr_string("g1") == [["g1"]] + + +def test_dnf_simple_and(): + assert _dnf_from_gpr_string("g1 and g2") == [["g1", "g2"]] + + +def test_dnf_simple_or(): + assert _dnf_from_gpr_string("g1 or g2") == [["g1"], ["g2"]] + + +def test_dnf_or_of_ands(): + assert _dnf_from_gpr_string("(g1 and g2) or (g3 and g4)") == [ + ["g1", "g2"], + ["g3", "g4"], + ] + + +def test_dnf_distributes_and_over_or(): + result = _dnf_from_gpr_string("g1 and (g2 or g3)") + assert result == [["g1", "g2"], ["g1", "g3"]] + + +def test_dnf_triple_or(): + assert _dnf_from_gpr_string("g1 or g2 or g3") == [ + ["g1"], ["g2"], ["g3"], + ] + + +def test_dnf_preserves_gene_order_within_clause(): + result = _dnf_from_gpr_string("g3 and g1 and g2") + assert result == [["g3", "g1", "g2"]] + + +# --------------------------------------------------------------------------- # +# expand_model +# --------------------------------------------------------------------------- # + +def _build_model( + reactions: list[tuple[str, dict[str, float], float, float, str]], +) -> cobra.Model: + """Build from (rxn_id, {met_id: coef}, lb, ub, gpr) tuples.""" + model = cobra.Model("test") + mets: dict[str, cobra.Metabolite] = {} + for _, stoich, _, _, _ in reactions: + for met_id in stoich: + if met_id not in mets: + mets[met_id] = cobra.Metabolite(met_id, compartment="c") + + for rxn_id, stoich, lb, ub, gpr 
in reactions: + rxn = cobra.Reaction(rxn_id) + rxn.lower_bound = lb + rxn.upper_bound = ub + rxn.add_metabolites({mets[m]: c for m, c in stoich.items()}) + if gpr: + rxn.gene_reaction_rule = gpr + model.add_reactions([rxn]) + return model + + +def test_does_not_expand_reaction_without_gpr(): + model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "")]) + added = expand_model(model) + assert added == [] + assert "r1" in {r.id for r in model.reactions} + + +def test_does_not_expand_single_and_clause(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 and g2"), + ]) + added = expand_model(model) + assert added == [] + r1 = model.reactions.get_by_id("r1") + assert r1.gene_reaction_rule == "g1 and g2" + + +def test_does_not_expand_single_gene(): + model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1")]) + added = expand_model(model) + assert added == [] + assert model.reactions.get_by_id("r1").gene_reaction_rule == "g1" + + +def test_splits_simple_or_into_two_reactions(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + rxn_ids = {r.id for r in model.reactions} + assert "r1" not in rxn_ids + assert "r1_EXP_1" in rxn_ids + assert "r1_EXP_2" in rxn_ids + + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g2" + + +def test_splits_or_of_ands(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "(g1 and g2) or (g3 and g4)"), + ]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g3 and g4" + + +def test_distributes_and_over_or(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "g1 and (g2 or g3)"), + 
]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g1 and g3" + + +def test_expanded_reactions_inherit_stoichiometry_and_bounds(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 2.0}, -500.0, 1500.0, "g1 or g2"), + ]) + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.bounds == (-500.0, 1500.0) + stoich = {m.id: c for m, c in rxn.metabolites.items()} + assert stoich == {"A": -1.0, "B": 2.0} + + +def test_expanded_reactions_inherit_name_and_subsystem(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + r1 = model.reactions.get_by_id("r1") + r1.name = "an isozyme-catalyzed reaction" + r1.subsystem = "central metabolism" + + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.name == "an isozyme-catalyzed reaction" + assert rxn.subsystem == "central metabolism" + + +def test_multiple_reactions_expand_independently(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"), + ("r3", {"C": -1.0, "D": 1.0}, 0.0, 1000.0, + "(g5 and g6) or g7 or (g8 and g9)"), + ]) + added = expand_model(model) + + assert added == sorted([ + "r1_EXP_1", "r1_EXP_2", + "r3_EXP_1", "r3_EXP_2", "r3_EXP_3", + ]) + + rxn_ids = {r.id for r in model.reactions} + assert "r2" in rxn_ids + assert "r1" not in rxn_ids + assert "r3" not in rxn_ids + + assert model.reactions.get_by_id("r2").gene_reaction_rule == "g3 and g4" + assert model.reactions.get_by_id("r3_EXP_1").gene_reaction_rule == "g5 and g6" + assert model.reactions.get_by_id("r3_EXP_2").gene_reaction_rule == "g7" + assert model.reactions.get_by_id("r3_EXP_3").gene_reaction_rule == "g8 and g9" + + 
+def test_expanded_reaction_has_correct_gene_set(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "(g1 and g2) or (g3 and g4)"), + ]) + expand_model(model) + + r1_1 = model.reactions.get_by_id("r1_EXP_1") + assert {g.id for g in r1_1.genes} == {"g1", "g2"} + + r1_2 = model.reactions.get_by_id("r1_EXP_2") + assert {g.id for g in r1_2.genes} == {"g3", "g4"} + + +def test_expansion_is_idempotent_in_the_no_op_sense(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"), + ]) + expand_model(model) + ids_before = {r.id for r in model.reactions} + + second = expand_model(model) + assert second == [] + + ids_after = {r.id for r in model.reactions} + assert ids_after == ids_before + + +def test_empty_model_is_unchanged(): + model = cobra.Model("empty") + assert expand_model(model) == [] + + +# --------------------------------------------------------------------------- # +# Annotation and notes propagation +# --------------------------------------------------------------------------- # + +def test_expanded_reactions_inherit_annotation_and_notes(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + r1 = model.reactions.get_by_id("r1") + r1.annotation["ec-code"] = "1.2.3.4" + r1.annotation["sbo"] = "SBO:0000176" + r1.notes["custom"] = "hello" + + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.annotation["ec-code"] == "1.2.3.4" + assert rxn.annotation["sbo"] == "SBO:0000176" + assert rxn.notes["custom"] == "hello" + + +def test_expanded_reaction_annotation_is_independent_of_parent(): + """Mutating one expanded reaction's annotation must not affect siblings.""" + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + model.reactions.get_by_id("r1").annotation["ec-code"] = ["1.2.3.4"] + + expand_model(model) + + r1_1 = 
model.reactions.get_by_id("r1_EXP_1") + r1_2 = model.reactions.get_by_id("r1_EXP_2") + r1_1.annotation["ec-code"].append("9.9.9.9") + assert r1_2.annotation["ec-code"] == ["1.2.3.4"] + + +def test_objective_coefficient_preserved_on_expansion(): + """An expanded reaction's isozyme copies retain the original objective coefficient.""" + m = cobra.Model("o") + a, b = (cobra.Metabolite(x, compartment="c") for x in "ab") + m.add_metabolites([a, b]) + r = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r.add_metabolites({a: -1, b: 1}) + r.gene_reaction_rule = "g1 or g2" + m.add_reactions([r]) + m.objective = "r1" # objective on the soon-to-be-expanded reaction + + expand_model(m) + coeffs = {rx.id: rx.objective_coefficient for rx in m.reactions} + assert coeffs == {"r1_EXP_1": 1.0, "r1_EXP_2": 1.0} # objective survives on both copies diff --git a/tests/test_manipulation_irreversible.py b/tests/test_manipulation_irreversible.py new file mode 100644 index 0000000..e211fa3 --- /dev/null +++ b/tests/test_manipulation_irreversible.py @@ -0,0 +1,144 @@ +"""Tests for convert_to_irreversible (RAVEN convertToIrrev.m). + +Adopted from geckopy's tests/test_preprocess.py (the convert_to_irreversible subset). +Exchange reactions are excluded from the split, matching MATLAB behavior. 
+""" +import cobra + +from raven_python.manipulation import convert_to_irreversible + + +def _build_model_with_bounds( + reactions: list[tuple[str, dict[str, float], float, float]], +) -> cobra.Model: + """Build from (rxn_id, {met_id: coef}, lb, ub) tuples.""" + model = cobra.Model("test") + mets: dict[str, cobra.Metabolite] = {} + for _, stoich, _, _ in reactions: + for met_id in stoich: + if met_id not in mets: + mets[met_id] = cobra.Metabolite(met_id, compartment="c") + + for rxn_id, stoich, lb, ub in reactions: + rxn = cobra.Reaction(rxn_id) + rxn.lower_bound = lb + rxn.upper_bound = ub + rxn.add_metabolites({mets[m]: c for m, c in stoich.items()}) + model.add_reactions([rxn]) + return model + + +def test_splits_single_reversible_non_exchange(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + + added = convert_to_irreversible(model) + assert added == ["r1_REV"] + + fwd = model.reactions.get_by_id("r1") + rev = model.reactions.get_by_id("r1_REV") + + assert fwd.bounds == (0.0, 1000.0) + assert {m.id: c for m, c in fwd.metabolites.items()} == {"A": -1.0, "B": 1.0} + + assert rev.bounds == (0.0, 500.0) + assert {m.id: c for m, c in rev.metabolites.items()} == {"A": 1.0, "B": -1.0} + + +def test_does_not_split_forward_only_reaction(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0), + ]) + added = convert_to_irreversible(model) + assert added == [] + assert "r1_REV" not in {r.id for r in model.reactions} + + +def test_does_not_split_exchange_reaction_even_if_reversible(): + """Exchange reactions (one metabolite) are explicitly excluded from + the irreversibility step in MATLAB, regardless of bounds.""" + model = _build_model_with_bounds([ + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + added = convert_to_irreversible(model) + assert added == [] + ex = model.reactions.get_by_id("EX_A") + assert ex.bounds == (-1000.0, 1000.0) + + +def test_splits_multiple_mixed_reactions(): + model = 
_build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), # split + ("r2", {"B": -2.0, "C": 3.0}, 0.0, 1000.0), # forward only + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), # exchange + ("r3", {"C": -1.0, "D": 1.0}, -200.0, 200.0), # split + ]) + + added = convert_to_irreversible(model) + assert added == ["r1_REV", "r3_REV"] + + assert model.reactions.get_by_id("r1").bounds == (0.0, 1000.0) + assert model.reactions.get_by_id("r1_REV").bounds == (0.0, 500.0) + assert model.reactions.get_by_id("r2").bounds == (0.0, 1000.0) + assert model.reactions.get_by_id("EX_A").bounds == (-1000.0, 1000.0) + assert model.reactions.get_by_id("r3").bounds == (0.0, 200.0) + assert model.reactions.get_by_id("r3_REV").bounds == (0.0, 200.0) + + +def test_reverse_reaction_inherits_gpr(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + model.reactions.get_by_id("r1").gene_reaction_rule = "g1 and g2" + + convert_to_irreversible(model) + + rev = model.reactions.get_by_id("r1_REV") + assert rev.gene_reaction_rule == "g1 and g2" + assert {g.id for g in rev.genes} == {"g1", "g2"} + + +def test_forward_reaction_lb_is_clamped_to_zero(): + """After splitting, the original reaction should have lb = 0, + which is what MATLAB's convertToIrrev does.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + convert_to_irreversible(model) + assert model.reactions.get_by_id("r1").lower_bound == 0.0 + + +def test_no_reverse_reaction_has_negative_bound(): + """After conversion, no non-exchange reaction may carry negative flux.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ("r2", {"B": -1.0, "C": 1.0}, -1000.0, 0.0), # blocked reverse + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + convert_to_irreversible(model) + for rxn in model.reactions: + if rxn.boundary: + continue + assert rxn.lower_bound >= 0, f"{rxn.id} still has lb < 0" + + +def 
test_returns_empty_list_when_nothing_to_split(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0), + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + assert convert_to_irreversible(model) == [] + + +def test_conversion_is_idempotent_after_first_pass(): + """Running convert_to_irreversible twice should not create + `_REV_REV` reactions, because the first pass already clamped + all non-exchange lb to 0.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + convert_to_irreversible(model) + second = convert_to_irreversible(model) + assert second == [] + assert "r1_REV_REV" not in {r.id for r in model.reactions} diff --git a/tests/test_manipulation_merge.py b/tests/test_manipulation_merge.py new file mode 100644 index 0000000..a430f6e --- /dev/null +++ b/tests/test_manipulation_merge.py @@ -0,0 +1,136 @@ +"""Tests for merge_models (mergeModels port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, merge_models + + +def _model(mid, mets, reactions): + m = cobra.Model(mid) + m.add_metabolites(mets) + add_reactions_from_equations(m, reactions) + return m + + +@pytest.fixture +def model_a(): + return _model( + "A", + [ + cobra.Metabolite("glc_c", name="Glucose", compartment="c"), + cobra.Metabolite("g6p_c", name="G6P", compartment="c"), + ], + [{"id": "HEX", "equation": "glc_c --> g6p_c", "gene_reaction_rule": "GA"}], + ) + + +@pytest.fixture +def model_b(): + # same Glucose[c] compound but a DIFFERENT id + return _model( + "B", + [ + cobra.Metabolite("glucose_c", name="Glucose", compartment="c"), + cobra.Metabolite("lac_c", name="Lactate", compartment="c"), + ], + [{"id": "LDH", "equation": "glucose_c --> lac_c", "gene_reaction_rule": "GB"}], + ) + + +def test_unifies_metabolites_by_name_comp(model_a, model_b): + merged = merge_models([model_a, model_b]) + glucoses = [m for m in merged.metabolites if m.name == "Glucose" and m.compartment == "c"] + 
assert len(glucoses) == 1 # glc_c and glucose_c unified + # both reactions reference the same merged Glucose object + hex_glc = [m for m in merged.reactions.get_by_id("HEX").metabolites if m.name == "Glucose"][0] + ldh_glc = [m for m in merged.reactions.get_by_id("LDH").metabolites if m.name == "Glucose"][0] + assert hex_glc is ldh_glc + + +def test_match_by_id_keeps_distinct(model_a, model_b): + merged = merge_models([model_a, model_b], match_by="id") + glucoses = [m for m in merged.metabolites if m.name == "Glucose"] + assert len(glucoses) == 2 # glc_c and glucose_c are distinct by id + + +def test_all_reactions_kept(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert {"HEX", "LDH"} <= {r.id for r in merged.reactions} + + +def test_reaction_id_collision_renamed(model_a): + # two models with the same reaction id but different chemistry + other = _model( + "B", + [cobra.Metabolite("glc_c", name="Glucose", compartment="c"), + cobra.Metabolite("x_c", name="X", compartment="c")], + [{"id": "HEX", "equation": "glc_c --> x_c"}], + ) + merged = merge_models([model_a, other]) + assert "HEX" in {r.id for r in merged.reactions} + assert "HEX_B" in {r.id for r in merged.reactions} # renamed with source id + + +def test_genes_merged(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert {"GA", "GB"} <= {g.id for g in merged.genes} + + +def test_provenance_recorded(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert merged.reactions.get_by_id("HEX").notes["origin"] == "A" + assert merged.reactions.get_by_id("LDH").notes["origin"] == "B" + assert merged.genes.get_by_id("GA").notes["origin"] == "A" + + +def test_compartments_preserved(model_a): + model_a.compartments = {"c": "cytoplasm"} + merged = merge_models([model_a, model_a.copy()]) + assert merged.compartments.get("c") == "cytoplasm" + + +def test_single_model_returns_copy(model_a): + merged = merge_models([model_a]) + assert merged is not model_a + assert {r.id for 
r in merged.reactions} == {r.id for r in model_a.reactions} + + +def test_three_models(model_a, model_b): + c = _model("C", [cobra.Metabolite("co2_c", name="CO2", compartment="c")], + [{"id": "SINK", "equation": "co2_c -->"}]) + merged = merge_models([model_a, model_b, c]) + assert {"HEX", "LDH", "SINK"} <= {r.id for r in merged.reactions} + + +def test_bad_match_by(model_a, model_b): + with pytest.raises(ValueError, match="match_by"): + merge_models([model_a, model_b], match_by="oops") + + +# --- regression: formula/charge conflict (known_issues.md B1) -------------- + +def test_formula_conflict_warns(): + """Two models sharing a name[comp] but with different formulas warn instead + of silently keeping the first.""" + a = _model("A", + [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", compartment="c")], + [{"id": "EX_A", "equation": "g1 -->"}]) + b = _model("B", + [cobra.Metabolite("g2", name="Glucose", formula="C6H12O7", compartment="c")], + [{"id": "EX_B", "equation": "g2 -->"}]) + with pytest.warns(UserWarning, match="different formulas"): + merged = merge_models([a, b]) + # The merge still picks the first-seen — the test asserts the warning fired + # and the model survives. 
+ assert "EX_A" in merged.reactions and "EX_B" in merged.reactions + + +def test_charge_conflict_warns(): + a = _model("A", + [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", charge=0, compartment="c")], + [{"id": "EX_A", "equation": "g1 -->"}]) + b = _model("B", + [cobra.Metabolite("g2", name="Glucose", formula="C6H12O6", charge=-1, compartment="c")], + [{"id": "EX_B", "equation": "g2 -->"}]) + with pytest.warns(UserWarning, match="different charges"): + merge_models([a, b]) diff --git a/tests/test_manipulation_remove.py b/tests/test_manipulation_remove.py new file mode 100644 index 0000000..2b659b9 --- /dev/null +++ b/tests/test_manipulation_remove.py @@ -0,0 +1,97 @@ +"""Tests for raven_python.manipulation.remove (removeMets/removeGenes ports).""" +import cobra +import pytest + +from raven_python.manipulation import ( + add_reactions_from_equations, + remove_genes, + remove_metabolites, +) + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", compartment="c"), + cobra.Metabolite("atp_m", name="ATP", compartment="m"), + cobra.Metabolite("adp_c", name="ADP", compartment="c"), + cobra.Metabolite("x_c", name="X", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}, + {"id": "R2", "equation": "atp_c --> x_c", "gene_reaction_rule": "G3 or G4"}, + {"id": "R3", "equation": "atp_m --> adp_c"}, # no GPR (spontaneous) + ], + ) + return m + + +# --- remove_metabolites ---------------------------------------------------- + +def test_remove_metabolites_by_id(model): + remove_metabolites(model, ["x_c"]) + assert "x_c" not in model.metabolites + # reaction kept, just lost the metabolite + assert "R2" in model.reactions + + +def test_remove_metabolites_by_name_across_compartments(model): + # "ATP" exists in c and m; by_name removes both at once. 
+ remove_metabolites(model, ["ATP"], by_name=True) + assert "atp_c" not in model.metabolites + assert "atp_m" not in model.metabolites + assert "adp_c" in model.metabolites + + +def test_remove_metabolites_destructive(model): + remove_metabolites(model, ["adp_c"], destructive=True) + # R1 and R3 both produced adp_c -> removed + assert "adp_c" not in model.metabolites + assert "R1" not in model.reactions and "R3" not in model.reactions + + +# --- remove_genes ---------------------------------------------------------- + +def test_remove_genes_remove_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="remove") + # R1 = "G1 and G2": removing G1 breaks the complex -> blocked -> removed + assert blocked == ["R1"] + assert "R1" not in model.reactions + assert "R2" in model.reactions # OR rule unaffected + + +def test_remove_genes_constrain_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="constrain") + assert blocked == ["R1"] + r1 = model.reactions.get_by_id("R1") + assert r1.bounds == (0, 0) # kept but constrained, per RAVEN default + assert r1.gene_reaction_rule == "" + + +def test_remove_genes_keep_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="keep") + assert blocked == ["R1"] + r1 = model.reactions.get_by_id("R1") + assert r1.gene_reaction_rule == "" + assert r1.bounds != (0, 0) # left untouched + + +def test_remove_genes_or_rule_not_blocked(model): + blocked = remove_genes(model, ["G3"], blocked_reactions="remove") + # R2 = "G3 or G4": removing G3 leaves G4 -> not blocked + assert blocked == [] + assert model.reactions.get_by_id("R2").gene_reaction_rule == "G4" + + +def test_remove_genes_absent_gene_is_noop(model): + assert remove_genes(model, ["NOPE"]) == [] + + +def test_remove_genes_bad_policy(model): + with pytest.raises(ValueError, match="blocked_reactions"): + remove_genes(model, ["G1"], blocked_reactions="explode") diff --git a/tests/test_manipulation_simplify.py 
b/tests/test_manipulation_simplify.py new file mode 100644 index 0000000..586a0c3 --- /dev/null +++ b/tests/test_manipulation_simplify.py @@ -0,0 +1,184 @@ +"""Tests for simplifyModel reduction modes.""" +import cobra +import pytest + +from raven_python.manipulation import ( + add_reactions_from_equations, + constrain_reversible_reactions, + group_linear_reactions, + remove_dead_end_reactions, + remove_duplicate_reactions, +) + +# --- remove_dead_end_reactions -------------------------------------------- + +def test_dead_end_removed(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "dead")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a"}, + {"id": "R1", "equation": "a --> b"}, + {"id": "R_out", "equation": "b --> "}, + {"id": "R_dead", "equation": "a --> dead"}, # 'dead' only produced + ], + ) + removed_rxns, removed_mets = remove_dead_end_reactions(m) + assert "R_dead" in removed_rxns + assert "dead" in removed_mets + # the productive path survives + assert {"R_in", "R1", "R_out"} <= {r.id for r in m.reactions} + + +def test_dead_end_respects_reserved(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "dead")]) + add_reactions_from_equations( + m, [{"id": "R_in", "equation": " --> a"}, {"id": "R_dead", "equation": "a --> dead"}] + ) + removed_rxns, _ = remove_dead_end_reactions(m, reserved=["R_dead"]) + assert "R_dead" not in removed_rxns + assert "R_dead" in {r.id for r in m.reactions} + + +# --- remove_duplicate_reactions ------------------------------------------- + +def test_duplicates_removed(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)}, + {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)}, # duplicate of R1 + {"id": "R3", "equation": "a --> b", "bounds": (0, 
500)}, # different bounds + ], + ) + removed = remove_duplicate_reactions(m) + assert len(removed) == 1 # one of R1/R2 removed + assert {"R3"} <= {r.id for r in m.reactions} + assert sum(r.id in ("R1", "R2") for r in m.reactions) == 1 + + +def test_duplicates_keep_reserved(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)}, + {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)}, + ], + ) + remove_duplicate_reactions(m, reserved=["R1"]) + assert "R1" in {r.id for r in m.reactions} # reserved one kept + + +# --- constrain_reversible_reactions --------------------------------------- + +def test_forward_only_reversible_constrained(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a", "bounds": (0, 1000)}, + {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)}, # can only go fwd + {"id": "R_out", "equation": "b --> ", "bounds": (0, 1000)}, + ], + ) + changed = constrain_reversible_reactions(m) + assert "R1" in changed + assert m.reactions.get_by_id("R1").lower_bound == 0 # constrained to forward + + +def test_truly_reversible_unchanged(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " <=> a", "bounds": (-1000, 1000)}, + {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)}, + {"id": "R_out", "equation": "b <=> ", "bounds": (-1000, 1000)}, + ], + ) + changed = constrain_reversible_reactions(m) + assert "R1" not in changed # can go both ways + + +# --- group_linear_reactions ----------------------------------------------- + +def test_linear_chain_merged(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in 
("a", "b", "c")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b"}, # b: single producer + {"id": "R2", "equation": "b --> c"}, # b: single consumer + ], + ) + n_before = len(m.reactions) + group_linear_reactions(m) + # b is eliminated; R1+R2 merged into one reaction a --> c + assert "b" not in m.metabolites + assert len(m.reactions) < n_before + + +def test_group_linear_discards_genes(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "gene_reaction_rule": "G1"}, + {"id": "R2", "equation": "b --> c", "gene_reaction_rule": "G2"}, + ], + ) + group_linear_reactions(m) + assert len(m.genes) == 0 + + +# --- regression: incremental merge collapses a long chain (known_issues.md D1) --- + +def test_group_linear_merges_long_chain_in_one_pass(): + """The incremental scan still flattens a 5-reaction linear chain — the + correctness property the original O(n²·m) restart-after-merge loop had.""" + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in "abcdef"]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a"}, + {"id": "R1", "equation": "a --> b"}, + {"id": "R2", "equation": "b --> c"}, + {"id": "R3", "equation": "c --> d"}, + {"id": "R4", "equation": "d --> e"}, + {"id": "R5", "equation": "e --> f"}, + {"id": "R_out", "equation": "f --> "}, + ], + ) + group_linear_reactions(m) + # All the chain's internal metabolites are gone. + assert {x for x in m.metabolites if x.id in {"b", "c", "d", "e"}} == set() + + +# --- regression: NaN FVA on infeasible model (known_issues.md C1) ---------- + +def test_constrain_reversible_raises_on_infeasible(): + """An infeasible model produces NaN FVA ranges; the old abs(NaN) < eps + check silently treated those as 'truly reversible'. 
Now raises.""" + m = cobra.Model("t") + a, b = (cobra.Metabolite(x, compartment="c") for x in ("a", "b")) + m.add_metabolites([a, b]) + # Force a contradiction: `forced` must consume at least 5 units of a, but b is a + # dead end, so steady state pins r to zero flux and nothing can supply a. + r = cobra.Reaction("r", lower_bound=-1, upper_bound=1) + r.add_metabolites({a: -1, b: 1}) + forced = cobra.Reaction("forced", lower_bound=5, upper_bound=10) # infeasible + forced.add_metabolites({a: -1}) + m.add_reactions([r, forced]) + with pytest.raises(RuntimeError, match="infeasible"): + constrain_reversible_reactions(m) diff --git a/tests/test_manipulation_transfer.py b/tests/test_manipulation_transfer.py new file mode 100644 index 0000000..61c2ac9 --- /dev/null +++ b/tests/test_manipulation_transfer.py @@ -0,0 +1,137 @@ +"""Tests for add_reactions_from_model (addRxnsGenesMets port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, add_reactions_from_model + + +@pytest.fixture +def draft(): + m = cobra.Model("draft") + m.add_metabolites( + [cobra.Metabolite("glc_c", name="Glucose", formula="C6H12O6", compartment="c")] + ) + # an existing reaction so glc_c is in use and we have an id to test skipping + add_reactions_from_equations(m, [{"id": "R_existing", "equation": "glc_c <=>"}]) + return m + + +@pytest.fixture +def source(): + m = cobra.Model("source") + m.add_metabolites( + [ + # same name[comp] as draft's glc_c but a DIFFERENT id + cobra.Metabolite("glucose_c", name="Glucose", formula="C6H12O6", compartment="c"), + cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"), + cobra.Metabolite("g6p_c", name="G6P", formula="C6H13O9P", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + { + "id": "HEX", + "equation": "glucose_c + atp_c --> g6p_c", + "name": "hexokinase", + "bounds": (0, 1000), + "gene_reaction_rule": "G1", + "subsystem": "glycolysis", + }, + {"id": "R_existing", "equation": "glucose_c <=>"}, # id already
in draft + ], + ) + return m + + +def test_metabolite_matched_by_name_comp_not_id(draft, source): + add_reactions_from_model(draft, source, "HEX") + hex_rxn = draft.reactions.get_by_id("HEX") + # Glucose reused from the draft (id glc_c), NOT the source's glucose_c + assert "glc_c" in {m.id for m in hex_rxn.metabolites} + assert "glucose_c" not in draft.metabolites + + +def test_new_metabolites_added_with_metadata(draft, source): + add_reactions_from_model(draft, source, "HEX") + assert "atp_c" in draft.metabolites and "g6p_c" in draft.metabolites + assert draft.metabolites.get_by_id("g6p_c").formula == "C6H13O9P" + assert draft.metabolites.get_by_id("atp_c").charge == -4 + + +def test_reaction_copied_with_bounds_and_name(draft, source): + (rxn,) = add_reactions_from_model(draft, source, "HEX") + assert rxn.id == "HEX" + assert rxn.name == "hexokinase" + assert rxn.bounds == (0, 1000) + assert rxn.subsystem == "glycolysis" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "glc_c": -1.0, + "atp_c": -1.0, + "g6p_c": 1.0, + } + + +def test_genes_true_copies_gpr_and_creates_genes(draft, source): + add_reactions_from_model(draft, source, "HEX", genes=True) + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G1" + assert "G1" in draft.genes + + +def test_genes_false_no_gpr(draft, source): + add_reactions_from_model(draft, source, "HEX", genes=False) + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "" + + +def test_genes_string_override(draft, source): + add_reactions_from_model(draft, source, "HEX", genes="G9 or G10") + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G9 or G10" + + +def test_skips_already_present(draft, source): + added = add_reactions_from_model(draft, source, ["HEX", "R_existing"]) + assert [r.id for r in added] == ["HEX"] + + +def test_all_present_raises(draft, source): + with pytest.raises(ValueError, match="already in the model"): + add_reactions_from_model(draft, source, 
"R_existing") + + +def test_unknown_source_reaction_raises(draft, source): + with pytest.raises(ValueError, match="not found in the source model"): + add_reactions_from_model(draft, source, "NOPE") + + +def test_note_and_confidence_stored(draft, source): + (rxn,) = add_reactions_from_model(draft, source, "HEX", note="from KEGG", confidence=2) + assert rxn.notes["note"] == "from KEGG" + assert rxn.notes["confidence_score"] == 2 + + +# --- regression: intra-batch met-id minting collision (known_issues.md A3) --- + +def test_intra_batch_id_minting_unique(): + """Two source mets whose ids both collide with the draft and whose name[comp] + differs both get routed through new-id minting. The fix tracks ids minted in + the current batch so the two don't collapse to the same generated id.""" + draft = cobra.Model("draft") + draft.add_metabolites([ + cobra.Metabolite("atp_c", name="ATP-draft", compartment="c"), + cobra.Metabolite("adp_c", name="ADP-draft", compartment="c"), + ]) + source = cobra.Model("source") + source.add_metabolites([ + cobra.Metabolite("atp_c", name="ATP-source", compartment="c"), + cobra.Metabolite("adp_c", name="ADP-source", compartment="c"), + ]) + rxn = cobra.Reaction("R1", lower_bound=0, upper_bound=1000) + source.add_reactions([rxn]) + rxn.add_metabolites({ + source.metabolites.get_by_id("atp_c"): -1, + source.metabolites.get_by_id("adp_c"): 1, + }) + add_reactions_from_model(draft, source, "R1") + # Both source mets minted distinct ids (m1 and m2) — not a collision. 
+ new_ids = sorted(m.id for m in draft.metabolites if m.id not in ("atp_c", "adp_c")) + assert len(new_ids) == 2 and len(set(new_ids)) == 2 diff --git a/tests/test_manipulation_transport.py b/tests/test_manipulation_transport.py new file mode 100644 index 0000000..e8fb2b6 --- /dev/null +++ b/tests/test_manipulation_transport.py @@ -0,0 +1,98 @@ +"""Tests for add_transport_reactions (addTransport port).""" +import cobra +import pytest + +from raven_python.manipulation import add_transport_reactions + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.compartments = {"c": "cytoplasm", "m": "mitochondrion", "e": "extracellular"} + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"), + cobra.Metabolite("h2o_c", name="H2O", formula="H2O", compartment="c"), + cobra.Metabolite("atp_m", name="ATP", compartment="m"), # exists in m + ] + ) + return m + + +def test_basic_transport_to_existing(model): + added = add_transport_reactions(model, "c", "m", ["ATP"]) + assert len(added) == 1 + rxn = added[0] + assert rxn.id == "tr_0001" + assert rxn.name == "ATP transport, cytoplasm-mitochondrion" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "atp_c": -1.0, + "atp_m": 1.0, + } + assert rxn.reversibility is True + + +def test_only_to_existing_skips_missing(model): + # H2O is not in m; with only_to_existing (default) it's skipped + added = add_transport_reactions(model, "c", "m", ["ATP", "H2O"]) + assert [r.id for r in added] == ["tr_0001"] # only ATP + + +def test_creates_missing_target_metabolite(model): + added = add_transport_reactions( + model, "c", "m", ["H2O"], only_to_existing=False + ) + assert len(added) == 1 + new = [mt for mt in model.metabolites if mt.name == "H2O" and mt.compartment == "m"] + assert len(new) == 1 + assert new[0].formula == "H2O" # copied from source + + +def test_copies_formula_and_charge(model): + add_transport_reactions(model, "c", "e", ["ATP"], 
only_to_existing=False) + new = [mt for mt in model.metabolites if mt.name == "ATP" and mt.compartment == "e"][0] + assert new.formula == "C10H16N5O13P3" + assert new.charge == -4 + + +def test_irreversible(model): + (rxn,) = add_transport_reactions(model, "c", "m", ["ATP"], reversible=False) + assert rxn.lower_bound == 0 + assert rxn.reversibility is False + + +def test_default_all_metabolites_in_from(model): + # default metabolite_names = all in c (ATP, H2O); to m, only_to_existing -> only ATP + added = add_transport_reactions(model, "c", "m") + assert [r.id for r in added] == ["tr_0001"] + + +def test_multiple_target_compartments_and_sequential_ids(model): + added = add_transport_reactions( + model, "c", ["m", "e"], ["ATP"], only_to_existing=False + ) + assert [r.id for r in added] == ["tr_0001", "tr_0002"] + + +def test_unknown_compartment_raises(model): + with pytest.raises(ValueError, match="not in the model"): + add_transport_reactions(model, "x", "m", ["ATP"]) + + +def test_unknown_metabolite_raises(model): + with pytest.raises(ValueError, match="not found in compartment"): + add_transport_reactions(model, "c", "m", ["NOPE"]) + + +# --- regression: duplicate name in compartment (known_issues.md A4) -------- + +def test_duplicate_name_in_source_compartment_warns(model): + """Two source mets sharing a name in the same compartment warn instead of + silently collapsing — previously one was dropped from the lookup dict.""" + model.add_metabolites([ + cobra.Metabolite("h2o2_c", name="H2O", compartment="c"), # duplicate name + ]) + with pytest.warns(UserWarning, match="Multiple metabolites named 'H2O'"): + added = add_transport_reactions(model, "c", "m", ["H2O"], only_to_existing=False) + # Transport still works (uses the first match) — the warning is the signal. 
+ assert len(added) == 1 diff --git a/tests/test_omics_hpa.py b/tests/test_omics_hpa.py new file mode 100644 index 0000000..8b2c124 --- /dev/null +++ b/tests/test_omics_hpa.py @@ -0,0 +1,154 @@ +"""Tests for omics/hpa.py — HPA parsing + score adapters (Phase 5).""" +from __future__ import annotations + +from textwrap import dedent + +import pytest + +from raven_python.omics import ( + HPA_LEVEL_SCORES, + HPAData, + HPARnaData, + hpa_gene_scores, + parse_hpa, + parse_hpa_rna, + rna_gene_scores, +) + + +@pytest.fixture +def hpa_tsv(tmp_path): + """Minimal HPA proteomics TSV: three genes over two tissues and three cell types (six rows).""" + p = tmp_path / "hpa.tsv" + p.write_text(dedent("""\ + Gene\tGene name\tTissue\tCell type\tLevel\tReliability + ENSG1\tGeneA\tliver\thepatocytes\tHigh\tEnhanced + ENSG1\tGeneA\tliver\tbile duct cells\tLow\tApproved + ENSG1\tGeneA\tkidney\ttubular cells\tNot detected\tApproved + ENSG2\tGeneB\tliver\thepatocytes\tMedium\tSupported + ENSG2\tGeneB\tkidney\ttubular cells\tHigh\tEnhanced + ENSG3\tGeneC\tliver\thepatocytes\tMixed\tUncertain + """)) + return p + + +@pytest.fixture +def rna_tsv(tmp_path): + """Tidy HPA-style RNA-seq TSV (Gene/Gene name/Tissue/TPM).""" + p = tmp_path / "rna.tsv" + p.write_text(dedent("""\ + Gene\tGene name\tTissue\tTPM + ENSG1\tGeneA\tliver\t100.0 + ENSG1\tGeneA\tkidney\t10.0 + ENSG2\tGeneB\tliver\t5.0 + ENSG2\tGeneB\tkidney\t50.0 + """)) + return p + + +# ---------------------------------------------------------------------- parsers + +def test_parse_hpa_basic(hpa_tsv): + hpa = parse_hpa(hpa_tsv) + assert isinstance(hpa, HPAData) + assert hpa.tissues() == ["kidney", "liver"] + assert hpa.celltypes("liver") == ["bile duct cells", "hepatocytes"] + # one row per (gene, tissue, celltype): + assert len(hpa.df) == 6 + assert set(hpa.df.columns) == {"gene_id", "gene_name", "tissue", "celltype", + "level", "reliability"} + + +def test_parse_hpa_missing_columns(tmp_path): + p = tmp_path / "bad.tsv" +
p.write_text("Gene\tTissue\nx\ty\n") + with pytest.raises(ValueError, match="missing HPA columns"): + parse_hpa(p) + + +def test_parse_hpa_rna_tidy(rna_tsv): + rna = parse_hpa_rna(rna_tsv) + assert isinstance(rna, HPARnaData) + assert rna.tissues() == ["kidney", "liver"] + assert rna.expression("liver") == {"ENSG1": 100.0, "ENSG2": 5.0} + + +def test_parse_hpa_rna_wide_layout(tmp_path): + """The older wide layout (one TPM column per tissue) is melted to the tidy form.""" + p = tmp_path / "rna_wide.tsv" + p.write_text(dedent("""\ + Gene\tGene name\tliver\tkidney + ENSG1\tGeneA\t100\t10 + ENSG2\tGeneB\t5\t50 + """)) + rna = parse_hpa_rna(p) + assert rna.expression("liver") == {"ENSG1": 100.0, "ENSG2": 5.0} + assert rna.expression("kidney") == {"ENSG1": 10.0, "ENSG2": 50.0} + + +# ---------------------------------------------------------------------- scoring + +def test_hpa_gene_scores_best_picks_max(hpa_tsv): + """In liver, ENSG1 is High (hepatocytes) + Low (bile duct) → best = 20.""" + g = hpa_gene_scores(parse_hpa(hpa_tsv), "liver", multiple_celltype="best") + assert g["ENSG1"] == HPA_LEVEL_SCORES["High"] # 20 + assert g["ENSG2"] == HPA_LEVEL_SCORES["Medium"] # 15 + + +def test_hpa_gene_scores_average(hpa_tsv): + """Average across cell types: ENSG1 in liver = mean(20, 10) = 15.""" + g = hpa_gene_scores(parse_hpa(hpa_tsv), "liver", multiple_celltype="average") + assert g["ENSG1"] == pytest.approx(15.0) + + +def test_hpa_gene_scores_celltype_filter(hpa_tsv): + """Restricting to a celltype gives only that celltype's score.""" + g = hpa_gene_scores(parse_hpa(hpa_tsv), "liver", celltype="bile duct cells") + assert g == {"ENSG1": HPA_LEVEL_SCORES["Low"]} # 10; GeneB has no bile-duct row + + +def test_hpa_gene_scores_unknown_level_omitted(hpa_tsv): + """A 'Mixed' / 'N/A' level is not in HPA_LEVEL_SCORES and is dropped (not -inf).""" + g = hpa_gene_scores(parse_hpa(hpa_tsv), "liver") + assert "ENSG3" not in g # the only ENSG3 row in liver has level='Mixed' + + +def 
test_hpa_gene_scores_unknown_celltype_returns_empty(hpa_tsv): + g = hpa_gene_scores(parse_hpa(hpa_tsv), "liver", celltype="cardiomyocytes") + assert g == {} + + +def test_hpa_gene_scores_custom_level_table(hpa_tsv): + """``level_scores`` overrides the default mapping.""" + g = hpa_gene_scores(parse_hpa(hpa_tsv), "liver", + level_scores={"High": 1.0, "Medium": 0.5, "Low": 0.1, "Not detected": -1.0}) + assert g == {"ENSG1": 1.0, "ENSG2": 0.5} + + +def test_rna_gene_scores_against_per_gene_mean(rna_tsv): + """Default reference is per-gene cross-tissue mean (RAVEN arrayData.threshold default). + + ENSG1 liver TPM=100, mean across tissues=55 → log(100/55) > 0 → positive score. + ENSG2 liver TPM=5, mean=27.5 → log(5/27.5) < 0 → negative score. + """ + g = rna_gene_scores(parse_hpa_rna(rna_tsv), "liver") + assert g["ENSG1"] > 0 + assert g["ENSG2"] < 0 + + +def test_rna_gene_scores_scalar_reference(rna_tsv): + """A scalar reference applies to all genes (and reuses gene_scores_from_expression).""" + g = rna_gene_scores(parse_hpa_rna(rna_tsv), "liver", reference=10.0) + # ENSG1 TPM=100, ref=10 → ln(10)*5 ≈ 11.5 → clamped to max_score=10. 
+ assert g["ENSG1"] == 10.0 + assert g["ENSG2"] < 0 # TPM=5 < ref=10 + + +def test_rna_gene_scores_unknown_tissue_raises(rna_tsv): + with pytest.raises(ValueError, match="not in dataset"): + rna_gene_scores(parse_hpa_rna(rna_tsv), "spleen") + + +def test_hpa_gene_scores_invalid_multiple_celltype(hpa_tsv): + with pytest.raises(ValueError, match="multiple_celltype"): + hpa_gene_scores(parse_hpa(hpa_tsv), "liver", multiple_celltype="weighted") diff --git a/tests/test_parameters.py b/tests/test_parameters.py new file mode 100644 index 0000000..c0ab06c --- /dev/null +++ b/tests/test_parameters.py @@ -0,0 +1,60 @@ +"""Tests for set_variance_bounds (the var mode of setParam).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, set_variance_bounds + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a_c <=> b_c"}, + {"id": "R2", "equation": "a_c <=> b_c"}, + ], + ) + return m + + +def test_band_positive(model): + set_variance_bounds(model, "R1", 100, 5) # 97.5 .. 
102.5 + lb, ub = model.reactions.get_by_id("R1").bounds + assert lb == pytest.approx(97.5) + assert ub == pytest.approx(102.5) + + +def test_band_negative_is_ordered(model): + set_variance_bounds(model, "R1", -100, 5) + lb, ub = model.reactions.get_by_id("R1").bounds + assert lb == pytest.approx(-102.5) + assert ub == pytest.approx(-97.5) + assert lb <= ub + + +def test_broadcast_scalar(model): + set_variance_bounds(model, ["R1", "R2"], 50, 10) + for rid in ("R1", "R2"): + lb, ub = model.reactions.get_by_id(rid).bounds + assert lb == pytest.approx(47.5) + assert ub == pytest.approx(52.5) + + +def test_per_reaction_values(model): + set_variance_bounds(model, ["R1", "R2"], [100, 200], 0) + assert model.reactions.get_by_id("R1").bounds == pytest.approx((100, 100)) + assert model.reactions.get_by_id("R2").bounds == pytest.approx((200, 200)) + + +def test_length_mismatch_raises(model): + with pytest.raises(ValueError, match="to match the reactions"): + set_variance_bounds(model, ["R1", "R2"], [1, 2, 3], 5) + + +def test_unknown_reaction_raises(model): + with pytest.raises(ValueError, match="not found"): + set_variance_bounds(model, "NOPE", 1, 5) diff --git a/tests/test_reconstruction_blast.py b/tests/test_reconstruction_blast.py new file mode 100644 index 0000000..32af556 --- /dev/null +++ b/tests/test_reconstruction_blast.py @@ -0,0 +1,78 @@ +"""Tests for run_blast / run_diamond / blast_from_table + the tabular parser.""" +import shutil + +import pandas as pd +import pytest + +from raven_python.reconstruction.homology import HIT_COLUMNS, blast_from_table, run_blast +from raven_python.reconstruction.homology.blast import _parse_tabular + +_SEQ = ( + "MSTNPKPQRKTKRNTNRRPQDVKFPGGGQIVGGVYLLPRRGPRLGVRATRKTSERSQPRGRRQPIPKARRPEGRTWAQPGYPWPLYGNEGCGWAGWLLSPRG" +) + + +def test_parse_tabular_csv(): + text = "tg1,ng1,1e-50,99.0,120,250.0,99.5\ntg2,ng2,0.0,100.0,200,400.0,100.0\n" + df = _parse_tabular(text, "templ", "org", sep=",") + assert list(df.columns) == HIT_COLUMNS + 
assert df.iloc[0].from_gene == "tg1" and df.iloc[0].to_gene == "ng1" + assert df.iloc[0].from_id == "templ" and df.iloc[0].to_id == "org" + assert df.iloc[1].identity == 100.0 and df.iloc[1].align_len == 200 + + +def test_parse_tabular_empty(): + assert _parse_tabular("", "a", "b", sep=",").empty + + +def test_blast_from_table_dataframe_roundtrip(): + df = pd.DataFrame( + [["templ", "org", "tg1", "ng1", 0.0, 100.0, 100, 200.0, 100.0]], + columns=HIT_COLUMNS + ["extra"][:0], # exactly HIT_COLUMNS + ) + out = blast_from_table(df) + assert list(out.columns) == HIT_COLUMNS + assert len(out) == 1 + + +def test_blast_from_table_csv(tmp_path): + p = tmp_path / "hits.csv" + pd.DataFrame( + [["templ", "org", "tg1", "ng1", 0.0, 100.0, 100, 200.0, 100.0]], columns=HIT_COLUMNS + ).to_csv(p, index=False) + out = blast_from_table(p) + assert out.iloc[0].from_gene == "tg1" + + +def test_blast_from_table_missing_columns(): + with pytest.raises(ValueError, match="missing required columns"): + blast_from_table(pd.DataFrame({"from_id": ["x"]})) + + +def test_blast_from_table_csv_numeric_gene_ids_stay_str(tmp_path): + """All-numeric gene ids (e.g. 
Entrez) read as str, so they match model gene ids.""" + p = tmp_path / "hits.csv" + pd.DataFrame( + [["templ", "org", 125, 4790, 0.0, 100.0, 100, 200.0, 100.0]], columns=HIT_COLUMNS + ).to_csv(p, index=False) + out = blast_from_table(p) + assert out.iloc[0].from_gene == "125" and out.iloc[0].to_gene == "4790" + + +@pytest.mark.skipif( + not (shutil.which("blastp") and shutil.which("makeblastdb")), reason="BLAST+ not installed" +) +def test_run_blast_integration(tmp_path): + org = tmp_path / "org.faa" + ref = tmp_path / "templ.faa" + org.write_text(f">ngene\n{_SEQ}\n") + ref.write_text(f">tgene\n{_SEQ}\n") # identical sequence -> strong reciprocal hit + + hits = run_blast("org", org, ["templ"], [ref]) + assert list(hits.columns) == HIT_COLUMNS + assert not hits.empty + # both directions present + assert {("templ", "org"), ("org", "templ")} <= set(zip(hits.from_id, hits.to_id, strict=False)) + # the reciprocal pair tgene<->ngene is found + fwd = hits[(hits.from_gene == "tgene") & (hits.to_gene == "ngene")] + assert not fwd.empty diff --git a/tests/test_reconstruction_homology.py b/tests/test_reconstruction_homology.py new file mode 100644 index 0000000..63ed72f --- /dev/null +++ b/tests/test_reconstruction_homology.py @@ -0,0 +1,138 @@ +"""Tests for homology reconstruction core (make_ortholog_hits + get_model_from_homology).""" +import cobra +import pandas as pd +import pytest + +from raven_python.manipulation import add_reactions_from_equations +from raven_python.reconstruction.homology import ( + HIT_COLUMNS, + get_model_from_homology, + make_ortholog_hits, +) + +# --- make_ortholog_hits ---------------------------------------------------- + +def test_make_ortholog_hits_bidirectional(): + hits = make_ortholog_hits([("tA", "nA"), ("tB", "nB")], "template", "neworg") + assert list(hits.columns) == HIT_COLUMNS + assert len(hits) == 4 # 2 pairs x 2 directions + fwd = hits[(hits.from_id == "template") & (hits.from_gene == "tA")] + assert fwd.iloc[0].to_gene == "nA" + 
rev = hits[(hits.from_id == "neworg") & (hits.from_gene == "nA")] + assert rev.iloc[0].to_gene == "tA" + + +def test_make_ortholog_hits_empty_raises(): + with pytest.raises(ValueError, match="empty"): + make_ortholog_hits([], "t", "n") + + +# --- template model fixture ------------------------------------------------ + +def _template(): + m = cobra.Model("templateGEM") + m.compartments = {"c": "cytoplasm"} + m.add_metabolites([cobra.Metabolite(x, name=x.upper(), compartment="c") for x in ("a", "b", "d")]) + add_reactions_from_equations( + m, + [ + {"id": "R_single", "equation": "a --> b", "gene_reaction_rule": "tg1"}, + {"id": "R_iso", "equation": "b --> d", "gene_reaction_rule": "tg2 or tg3"}, + {"id": "R_cplx", "equation": "a --> d", "gene_reaction_rule": "tg4 and tg5"}, + ], + ) + return m + + +# --- one-to-one transfer --------------------------------------------------- + +def test_single_gene_reaction_transferred(): + t = _template() + hits = make_ortholog_hits([("tg1", "ng1")], "templateGEM", "bug") + res = get_model_from_homology([t], hits, "bug") + assert res.model.id == "bug" + assert "R_single" in {r.id for r in res.model.reactions} + assert res.model.reactions.get_by_id("R_single").gene_reaction_rule == "ng1" + + +def test_unsupported_reaction_dropped(): + t = _template() + hits = make_ortholog_hits([("tg1", "ng1")], "templateGEM", "bug") # only tg1 mapped + res = get_model_from_homology([t], hits, "bug") + # R_iso (tg2/tg3) and R_cplx (tg4/tg5) have no ortholog -> dropped + assert {r.id for r in res.model.reactions} == {"R_single"} + + +def test_one_to_many_orthologs_become_or(): + t = _template() + hits = make_ortholog_hits([("tg1", "ngA"), ("tg1", "ngB")], "templateGEM", "bug") + res = get_model_from_homology([t], hits, "bug") + assert res.model.reactions.get_by_id("R_single").gene_reaction_rule == "ngA or ngB" + + +# --- isozyme (OR) handling ------------------------------------------------- + +def test_isozyme_branch_without_ortholog_dropped(): + t 
= _template() + hits = make_ortholog_hits([("tg2", "ng2")], "templateGEM", "bug") # only one isozyme maps + res = get_model_from_homology([t], hits, "bug") + assert res.model.reactions.get_by_id("R_iso").gene_reaction_rule == "ng2" + + +# --- complex (AND) policies ------------------------------------------------ + +def _complex_hits(): + # only tg4 of the tg4-and-tg5 complex has an ortholog + return make_ortholog_hits([("tg4", "ng4")], "templateGEM", "bug") + + +def test_complex_policy_flag_keeps_old_marker(): + res = get_model_from_homology([_template()], _complex_hits(), "bug", complex_policy="flag") + gpr = res.model.reactions.get_by_id("R_cplx").gene_reaction_rule + assert "ng4" in gpr and "OLD_templateGEM_tg5" in gpr and " and " in gpr + + +def test_complex_policy_keep_drops_unmapped_subunit(): + res = get_model_from_homology([_template()], _complex_hits(), "bug", complex_policy="keep") + assert res.model.reactions.get_by_id("R_cplx").gene_reaction_rule == "ng4" + + +def test_complex_policy_drop_removes_reaction(): + res = get_model_from_homology([_template()], _complex_hits(), "bug", complex_policy="drop") + assert "R_cplx" not in {r.id for r in res.model.reactions} + + +# --- strictness alias + bidirectional -------------------------------------- + +def test_strictness_alias_maps_params(): + t = _template() + hits = make_ortholog_hits([("tg1", "ng1")], "templateGEM", "bug") + res = get_model_from_homology([t], hits, "bug", strictness=3) # bidir + best-hits + assert "R_single" in {r.id for r in res.model.reactions} + + +def test_one_directional_non_reciprocal(): + # build hits with only the new->old direction present + hits = make_ortholog_hits([("tg1", "ng1")], "templateGEM", "bug") + one_way = hits[hits.from_id == "bug"] # drop the template->new rows + t = _template() + # bidirectional default would find nothing; one-directional should map + assert "R_single" not in {r.id for r in get_model_from_homology([t], one_way, "bug").model.reactions} + res = 
get_model_from_homology([t], one_way, "bug", bidirectional=False, map_direction="new_to_old") + assert "R_single" in {r.id for r in res.model.reactions} + + +# --- preferred order ------------------------------------------------------- + +def test_preferred_order_routes_gene_to_one_model(): + t1 = _template() + t1.id = "modelA" + t2 = _template() + t2.id = "modelB" + hits1 = make_ortholog_hits([("tg1", "ng1")], "modelA", "bug") + hits2 = make_ortholog_hits([("tg1", "ng1")], "modelB", "bug") + hits = pd.concat([hits1, hits2], ignore_index=True) + res = get_model_from_homology([t1, t2], hits, "bug", preferred_order=["modelA", "modelB"]) + # ng1's reaction comes only from modelA + sources = {r.notes.get("homology_source") for r in res.model.reactions if r.id.startswith("R_single")} + assert sources == {"modelA"} diff --git a/tests/test_reconstruction_kegg_download.py b/tests/test_reconstruction_kegg_download.py new file mode 100644 index 0000000..38d2f44 --- /dev/null +++ b/tests/test_reconstruction_kegg_download.py @@ -0,0 +1,125 @@ +"""Tests for the KEGG download/arrange tooling (reconstruction/kegg/download.py). + +The network fetch needs a paid KEGG subscription, so it is not exercised here. +We test credential resolution and the network-free extract/arrange core against +hand-built fake archives. 
+""" +import gzip +import io +import tarfile +from pathlib import Path + +import pytest + +from raven_python.reconstruction.kegg.download import ( + _resolve_auth, + extract_kegg_dump, +) + + +def _make_targz(path: Path, members: dict[str, bytes]) -> None: + with tarfile.open(path, "w:gz") as tar: + for name, data in members.items(): + info = tarfile.TarInfo(name) + info.size = len(data) + tar.addfile(info, io.BytesIO(data)) + + +def _make_gz(path: Path, data: bytes) -> None: + with gzip.open(path, "wb") as fh: + fh.write(data) + + +# --------------------------------------------------------------------------- # +# Credentials +# --------------------------------------------------------------------------- # +def test_resolve_auth_explicit_wins(): + assert _resolve_auth("ftp.kegg.net", auth=("u", "p")) == ("u", "p") + + +def test_resolve_auth_from_netrc(tmp_path): + netrc_file = tmp_path / ".netrc" + netrc_file.write_text("machine ftp.kegg.net login alice password s3cret\n") + netrc_file.chmod(0o600) + assert _resolve_auth("ftp.kegg.net", netrc_path=netrc_file) == ("alice", "s3cret") + + +def test_resolve_auth_missing_file(tmp_path): + with pytest.raises(FileNotFoundError, match="does not exist"): + _resolve_auth("ftp.kegg.net", netrc_path=tmp_path / "nope") + + +def test_resolve_auth_host_absent(tmp_path): + netrc_file = tmp_path / ".netrc" + netrc_file.write_text("machine other.host login a password b\n") + netrc_file.chmod(0o600) + with pytest.raises(ValueError, match="No credentials for"): + _resolve_auth("ftp.kegg.net", netrc_path=netrc_file) + + +# --------------------------------------------------------------------------- # +# Extract / arrange +# --------------------------------------------------------------------------- # +@pytest.fixture +def fake_dump(tmp_path): + """A tmp dir populated with fake KEGG archives, as fetch would leave them.""" + _make_targz( + tmp_path / "reaction.tar.gz", + { + "reaction/reaction": b"RXN_ENTRIES\n", + "reaction/reaction.lst": 
b"R00010: A <=> B\n", + "reaction/reaction_mapformula.lst": b"R00010: 00010: A => B\n", + "reaction/reaction.name": b"discard me\n", # extra file, not lifted + }, + ) + _make_targz( + tmp_path / "compound.tar.gz", + {"compound/compound": b"CPD\n", "compound/compound.inchi": b"C00031\tInChI=x\n"}, + ) + _make_targz(tmp_path / "glycan.tar.gz", {"glycan/glycan": b"GLY\n"}) + _make_targz(tmp_path / "ko.tar.gz", {"ko/ko": b"KO\n"}) + _make_gz(tmp_path / "eukaryotes.pep.gz", b">euk\nMKV\n") + _make_gz(tmp_path / "prokaryotes.pep.gz", b">prok\nMAA\n") + (tmp_path / "taxonomy").write_text("tax\n") + return tmp_path + + +def test_extract_produces_flat_layout(fake_dump): + result = extract_kegg_dump(fake_dump) + expected = { + "reaction", + "reaction.lst", + "reaction_mapformula.lst", + "compound", + "compound.inchi", + "ko", + "genes.pep", + "taxonomy", + } + assert set(result) == expected + assert all(p.is_file() for p in result.values()) + + +def test_extract_concatenates_compound_and_glycan(fake_dump): + extract_kegg_dump(fake_dump) + assert (fake_dump / "compound").read_bytes() == b"CPD\nGLY\n" + + +def test_extract_concatenates_proteomes(fake_dump): + extract_kegg_dump(fake_dump) + assert (fake_dump / "genes.pep").read_bytes() == b">euk\nMKV\n>prok\nMAA\n" + + +def test_extract_removes_subdirs_and_archives(fake_dump): + extract_kegg_dump(fake_dump) + assert not list(fake_dump.glob("*.tar.gz")) + assert not list(fake_dump.glob("*.gz")) + for subdir in ("reaction", "compound", "glycan", "ko"): + assert not (fake_dump / subdir).is_dir() + assert not (fake_dump / "reaction.name").exists() # extra file discarded + + +def test_extract_requires_core_archives(tmp_path): + _make_targz(tmp_path / "compound.tar.gz", {"compound/compound": b"CPD\n"}) + with pytest.raises(FileNotFoundError, match="required file"): + extract_kegg_dump(tmp_path) diff --git a/tests/test_reconstruction_kegg_hmm.py b/tests/test_reconstruction_kegg_hmm.py new file mode 100644 index 0000000..3f559ec --- 
/dev/null +++ b/tests/test_reconstruction_kegg_hmm.py @@ -0,0 +1,326 @@ +"""Tests for KEGG HMM-library construction (taxonomy + hmm, step 3b.3).""" +from pathlib import Path + +import pandas as pd +import pytest + +from raven_python.reconstruction.kegg import ( + build_ko_fastas, + organism_domains, + organisms_in_domain, + parse_taxonomy, +) +from raven_python.reconstruction.kegg import hmm as hmm_mod +from raven_python.reconstruction.kegg.hmm import ( + _cdhit_cmd, + _cdhit_word_size, + _fasta_stats, + _hmmbuild_cmd, + _mafft_cmd, + build_ko_hmm, +) + +DUMP = Path(__file__).parent / "data" / "kegg_dump" + + +@pytest.fixture +def organism_gene_ko(): + return pd.DataFrame( + [ + ("bsu", "BSU31050", "K01194"), + ("bsu", "BSU31060", "K01194"), + ("hsa", "124", "K01194"), + ("hsa", "125", "K01194"), + ("eco", "b0001", "K00002"), + ], + columns=["organism", "gene", "ko"], + ) + + +# --------------------------------------------------------------------------- # +# Taxonomy +# --------------------------------------------------------------------------- # +def test_parse_taxonomy_lineages(): + cats = parse_taxonomy(DUMP / "taxonomy") + assert cats["bsu"] == ["Prokaryotes", "Bacteria", "Firmicutes"] + assert cats["hsa"][0] == "Eukaryotes" + assert cats["eco"][1] == "Bacteria" + + +def test_organism_domains(): + assert organism_domains(DUMP / "taxonomy") == { + "bsu": "Prokaryotes", + "eco": "Prokaryotes", + "hsa": "Eukaryotes", + } + + +def test_organisms_in_domain_prefix_match(): + assert organisms_in_domain(DUMP / "taxonomy", "prok") == {"bsu", "eco"} + assert organisms_in_domain(DUMP / "taxonomy", "Eukaryotes") == {"hsa"} + + +def test_parse_taxonomy_handles_skipped_depth(tmp_path): + """A ``##`` directly under a ``#`` (skipping ``##`` level) used to corrupt + the stack. 
Now pads with '' placeholders and warns once (known_issues.md C4).""" + p = tmp_path / "tax" + p.write_text( + "#Domain1\n" + "###Skipped\n" # skips ## + "T9999\torg1\tan org\n" + ) + with pytest.warns(UserWarning, match="depth skips a level"): + cats = parse_taxonomy(p) + # Domain still recoverable; the missing level is a placeholder. + assert cats["org1"][0] == "Domain1" + assert cats["org1"][-1] == "Skipped" + + +# --------------------------------------------------------------------------- # +# build_ko_fastas (constructMultiFasta) +# --------------------------------------------------------------------------- # +def test_build_ko_fastas_groups_by_ko(organism_gene_ko, tmp_path): + written = build_ko_fastas(organism_gene_ko, DUMP / "genes.pep", tmp_path) + assert set(written) == {"K01194", "K00002"} + k01194 = (tmp_path / "K01194.fa").read_text() + assert k01194.count(">") == 4 # bsu x2 + hsa x2 + assert ">bsu:BSU31050" in k01194 + assert ">xxx:unused" not in k01194 # gene not in any KO is excluded + + +def test_build_ko_fastas_domain_filter(organism_gene_ko, tmp_path): + prok = organisms_in_domain(DUMP / "taxonomy", "prokaryotes") + written = build_ko_fastas(organism_gene_ko, DUMP / "genes.pep", tmp_path, organisms=prok) + # Only prokaryote genes: K01194 keeps bsu (2), K00002 keeps eco (1). 
+ assert (tmp_path / "K01194.fa").read_text().count(">") == 2 + assert ">hsa:" not in (tmp_path / "K01194.fa").read_text() + assert set(written) == {"K01194", "K00002"} + + +def test_build_ko_fastas_sequences_intact(organism_gene_ko, tmp_path): + build_ko_fastas(organism_gene_ko, DUMP / "genes.pep", tmp_path) + text = (tmp_path / "K00002.fa").read_text() + assert text.startswith(">eco:b0001") + assert "MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDA" in text + + +# --------------------------------------------------------------------------- # +# Command builders / CD-HIT word size (pure) +# --------------------------------------------------------------------------- # +@pytest.mark.parametrize( + "identity, expected", + [(0.9, "5"), (0.7, "4"), (0.65, "4"), (0.55, "3"), (0.45, "2")], +) +def test_cdhit_word_size(identity, expected): + assert _cdhit_word_size(identity) == expected + + +def test_cdhit_word_size_out_of_range(): + with pytest.raises(ValueError, match="seq_identity"): + _cdhit_word_size(0.3) + + +def test_command_builders(): + cd = _cdhit_cmd("cd-hit", Path("in.fa"), Path("out.fa"), 0.9, 4) + assert cd[:3] == ["cd-hit", "-i", "in.fa"] + assert "-c" in cd and "0.9" in cd and "-n" in cd and "5" in cd + # Default is fast progressive (FFT-NS-2), not --auto. 
+ assert _mafft_cmd("mafft", Path("in.fa"), 2) == [ + "mafft", "--retree", "2", "--maxiterate", "0", "--anysymbol", "--thread", "2", "in.fa" + ] + assert _mafft_cmd("mafft", Path("in.fa"), 2, fast=False)[:2] == ["mafft", "--auto"] + assert "--parttree" in _mafft_cmd("mafft", Path("in.fa"), 2, parttree=True) + assert _hmmbuild_cmd("hmmbuild", Path("o.hmm"), Path("a.fa"), 3) == [ + "hmmbuild", "--cpu", "3", "o.hmm", "a.fa" + ] + + +# --------------------------------------------------------------------------- # +# build_ko_hmm orchestration (binaries mocked) +# --------------------------------------------------------------------------- # +def test_build_ko_hmm_multi_sequence_runs_full_pipeline(tmp_path, monkeypatch): + fasta = tmp_path / "K01194.fa" + fasta.write_text(">a\nMKV\n>b\nMRV\n") + calls = [] + + monkeypatch.setattr( + "raven_python.reconstruction.kegg.hmm.resolve_binary", + lambda exe, binary=None: binary or exe, + ) + + def fake_run(cmd, *, stdout_path=None): + calls.append(Path(cmd[0]).name) + # Emulate each tool producing its expected output file. 
+ if stdout_path is not None: + Path(stdout_path).write_text(">a\nMKV\n>b\nMRV\n") + if Path(cmd[0]).name == "cd-hit": + Path(cmd[cmd.index("-o") + 1]).write_text(">a\nMKV\n>b\nMRV\n") + if Path(cmd[0]).name == "hmmbuild": + Path(cmd[-2]).write_text("HMM\n") + return "" + + monkeypatch.setattr("raven_python.reconstruction.kegg.hmm._run", fake_run) + out = build_ko_hmm(fasta, tmp_path / "K01194.hmm") + assert calls == ["cd-hit", "mafft", "hmmbuild"] + assert out.read_text() == "HMM\n" + + +def test_build_ko_hmm_single_sequence_skips_align(tmp_path, monkeypatch): + fasta = tmp_path / "K9.fa" + fasta.write_text(">only\nMKV\n") + calls = [] + monkeypatch.setattr( + "raven_python.reconstruction.kegg.hmm.resolve_binary", + lambda exe, binary=None: binary or exe, + ) + + def fake_run(cmd, *, stdout_path=None): + calls.append(Path(cmd[0]).name) + if Path(cmd[0]).name == "hmmbuild": + Path(cmd[-2]).write_text("HMM\n") + return "" + + monkeypatch.setattr("raven_python.reconstruction.kegg.hmm._run", fake_run) + build_ko_hmm(fasta, tmp_path / "K9.hmm") + assert calls == ["hmmbuild"] # no cd-hit / mafft for a lone sequence + + +def test_build_ko_hmm_verbose_logs_each_stage(tmp_path, monkeypatch, caplog): + fasta = tmp_path / "K01194.fa" + fasta.write_text(">a\nMKV\n>b\nMRV\n") + monkeypatch.setattr( + "raven_python.reconstruction.kegg.hmm.resolve_binary", lambda exe, binary=None: binary or exe + ) + + def fake_run(cmd, *, stdout_path=None): + if stdout_path is not None: + Path(stdout_path).write_text(">a\nMKV\n>b\nMRV\n") + if Path(cmd[0]).name == "cd-hit": + Path(cmd[cmd.index("-o") + 1]).write_text(">a\nMKV\n>b\nMRV\n") + if Path(cmd[0]).name == "hmmbuild": + Path(cmd[-2]).write_text("HMM\n") + return "" + + monkeypatch.setattr("raven_python.reconstruction.kegg.hmm._run", fake_run) + with caplog.at_level("INFO", logger="raven_python.reconstruction.kegg.hmm"): + build_ko_hmm(fasta, tmp_path / "K01194.hmm", verbose=True) + text = caplog.text + # Each stage is logged, labelled 
with the KO id. + assert "[K01194] start: 2 sequences" in text + assert "[K01194] CD-HIT" in text + assert "[K01194] MAFFT" in text + assert "[K01194] hmmbuild: done in" in text + # Each stage is a single line: the tool/params and the timing together, not split. + assert "running" not in text + assert "[K01194] complete" in text + + +def test_build_ko_hmm_quiet_by_default(tmp_path, monkeypatch, caplog): + fasta = tmp_path / "K9.fa" + fasta.write_text(">only\nMKV\n") + monkeypatch.setattr( + "raven_python.reconstruction.kegg.hmm.resolve_binary", lambda exe, binary=None: binary or exe + ) + monkeypatch.setattr( + "raven_python.reconstruction.kegg.hmm._run", + lambda cmd, *, stdout_path=None: Path(cmd[-2]).write_text("HMM\n") and "", + ) + with caplog.at_level("INFO", logger="raven_python.reconstruction.kegg.hmm"): + build_ko_hmm(fasta, tmp_path / "K9.hmm") # verbose defaults False + assert caplog.text == "" + + +def test_fasta_stats_counts_residues(tmp_path): + fa = tmp_path / "x.fa" + fa.write_text(">a\nMKVL\nAAG\n>b\nMR\n") # a=7 residues (2 lines), b=2 + assert _fasta_stats(fa) == (2, 9) + + +def test_auto_cost_budget_scales_with_memory(monkeypatch): + hmm_mod._auto_cost_budget.cache_clear() + monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: 64 * 1024**3) + big = hmm_mod._auto_cost_budget() + hmm_mod._auto_cost_budget.cache_clear() + monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: 8 * 1024**3) + small = hmm_mod._auto_cost_budget() + assert big > small > 0 # more RAM -> larger DP-cost budget + hmm_mod._auto_cost_budget.cache_clear() + + +def test_auto_cost_budget_warns_on_low_memory(monkeypatch, caplog): + hmm_mod._auto_cost_budget.cache_clear() + monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: 7 * 1024**3) + with caplog.at_level("WARNING", logger="raven_python.reconstruction.kegg.hmm"): + hmm_mod._auto_cost_budget() + assert "Limited memory" in caplog.text + hmm_mod._auto_cost_budget.cache_clear() + + +def 
test_auto_cost_budget_falls_back_without_detection(monkeypatch, caplog): + hmm_mod._auto_cost_budget.cache_clear() + monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: None) + with caplog.at_level("WARNING", logger="raven_python.reconstruction.kegg.hmm"): + assert hmm_mod._auto_cost_budget() == hmm_mod._DEFAULT_COST_BUDGET + assert "Could not detect system memory" in caplog.text + hmm_mod._auto_cost_budget.cache_clear() + + +def test_long_proteins_route_to_parttree(monkeypatch, tmp_path): + # Few but very long sequences (K12047-like): low residue count, high DP cost, + # so the length-aware budget must pick PartTree (a residue-only rule would not). + fasta = tmp_path / "K12047.fa" + fasta.write_text("".join(f">g{i}\n{'M' * 2000}\n" for i in range(300))) # 300 x 2000 aa + monkeypatch.setattr(hmm_mod, "resolve_binary", lambda exe, binary=None: binary or exe) + hmm_mod._auto_cost_budget.cache_clear() + monkeypatch.setattr(hmm_mod, "_total_memory_bytes", lambda: 8 * 1024**3) + seen = {} + + def fake_run(cmd, *, stdout_path=None): + name = Path(cmd[0]).name + if name == "cd-hit": + Path(cmd[cmd.index("-o") + 1]).write_text(fasta.read_text()) + if name == "mafft": + seen["parttree"] = "--parttree" in cmd + Path(stdout_path).write_text(fasta.read_text()) + if name == "hmmbuild": + Path(cmd[-2]).write_text("HMM\n") + return "" + + monkeypatch.setattr(hmm_mod, "_run", fake_run) + build_ko_hmm(fasta, tmp_path / "K12047.hmm") + hmm_mod._auto_cost_budget.cache_clear() + # 300x2000 = 600k residues (a residue rule with a ~1M cutoff would NOT trigger), + # but DP cost 1.2e9 exceeds the 8 GB budget -> PartTree. + assert seen["parttree"] is True + + +def test_parttree_residues_param_overrides_auto(tmp_path, monkeypatch): + # The explicit parttree_residues argument decides the MAFFT method (residues only). 
+ fasta = tmp_path / "K.fa" + fasta.write_text("".join(f">g{i}\n{'M' * 1000}\n" for i in range(5))) # 5000 residues + monkeypatch.setattr(hmm_mod, "resolve_binary", lambda exe, binary=None: binary or exe) + seen = {} + + def fake_run(cmd, *, stdout_path=None): + name = Path(cmd[0]).name + if name == "cd-hit": + Path(cmd[cmd.index("-o") + 1]).write_text(fasta.read_text()) + if name == "mafft": + seen["parttree"] = "--parttree" in cmd + Path(stdout_path).write_text(fasta.read_text()) + if name == "hmmbuild": + Path(cmd[-2]).write_text("HMM\n") + return "" + + monkeypatch.setattr(hmm_mod, "_run", fake_run) + build_ko_hmm(fasta, tmp_path / "a.hmm", parttree_residues=10_000) # 5000 < 10000 + assert seen["parttree"] is False # stays on FFT-NS-2 + build_ko_hmm(fasta, tmp_path / "b.hmm", parttree_residues=4000) # 5000 > 4000 + assert seen["parttree"] is True # switches to PartTree + + +def test_build_ko_hmm_empty_fasta_raises(tmp_path): + fasta = tmp_path / "empty.fa" + fasta.write_text("") + with pytest.raises(ValueError, match="no sequences"): + build_ko_hmm(fasta, tmp_path / "empty.hmm") diff --git a/tests/test_reconstruction_kegg_organism.py b/tests/test_reconstruction_kegg_organism.py new file mode 100644 index 0000000..f64f15b --- /dev/null +++ b/tests/test_reconstruction_kegg_organism.py @@ -0,0 +1,179 @@ +"""Tests for get_kegg_model_for_organism (KEGG organism-ID mode, step 3b.4).""" +from pathlib import Path + +import cobra +import pandas as pd +import pytest + +from raven_python.reconstruction.kegg import ( + build_kegg_tables, + build_reference_model, + get_kegg_model_for_organism, + get_kegg_model_for_organism_from_artefacts, + parse_kegg_compounds, + parse_kegg_dump, + parse_kegg_reactions, +) + +DUMP = Path(__file__).parent / "data" / "kegg_dump" + + +@pytest.fixture(scope="module") +def artefacts(): + reactions = parse_kegg_reactions(DUMP) + compounds = parse_kegg_compounds(DUMP) + linked = {ko for r in reactions for ko in r.kos} + from 
raven_python.reconstruction.kegg import parse_kegg_kos + + kos = parse_kegg_kos(DUMP, keep=linked) + model = build_reference_model(reactions, compounds) + tables = build_kegg_tables(reactions, kos) + return model, tables + + +def _build(artefacts, organism_id, **kw): + model, tables = artefacts + return get_kegg_model_for_organism( + organism_id, + model, + tables["ko_reaction"], + tables["organism_gene_ko"], + rxn_flags=tables["rxn_flags"], + **kw, + ) + + +# --------------------------------------------------------------------------- # +# Core behaviour +# --------------------------------------------------------------------------- # +def test_eco_keeps_only_its_reactions(artefacts): + # eco has b0001 -> K00002 -> R00100 only. + model = _build(artefacts, "eco") + assert {r.id for r in model.reactions} == {"R00100"} + assert model.id == "eco" + + +def test_eco_gpr_and_gene_annotation(artefacts): + model = _build(artefacts, "eco") + r = model.reactions.get_by_id("R00100") + assert r.gene_reaction_rule == "b0001" + assert model.genes.get_by_id("b0001").annotation["kegg.genes"] == "eco:b0001" + assert r.notes["note"].startswith("Included by get_kegg_model_for_organism") + + +def test_bsu_or_joins_multiple_genes(artefacts): + # bsu has BSU31050 + BSU31060, both -> K01194 -> R00010. + model = _build(artefacts, "bsu") + r = model.reactions.get_by_id("R00010") + assert set(r.genes) == {model.genes.get_by_id("BSU31050"), model.genes.get_by_id("BSU31060")} + assert r.gene_reaction_rule == "BSU31050 or BSU31060" + + +def test_case_insensitive_organism(artefacts): + assert "R00010" in _build(artefacts, "BSU").reactions + + +def test_orphan_metabolites_pruned(artefacts): + # eco keeps only R00100 (C00002, C00003); trehalose/glucose mets should go. 
+ model = _build(artefacts, "eco") + assert {m.id for m in model.metabolites} == {"C00002", "C00003"} + + +def test_reference_model_unmodified(artefacts): + reference, _ = artefacts + before = len(reference.reactions) + _build(artefacts, "eco") + assert len(reference.reactions) == before # worked on a copy + assert len(reference.genes) == 0 + + +# --------------------------------------------------------------------------- # +# Spontaneous handling +# --------------------------------------------------------------------------- # +def test_spontaneous_reaction_kept_without_genes(artefacts): + # R00100 is spontaneous; for bsu it has no genes but is kept (no GPR). + model = _build(artefacts, "bsu", keep_spontaneous=True) + assert "R00100" in model.reactions + assert model.reactions.get_by_id("R00100").gene_reaction_rule == "" + + +def test_spontaneous_dropped_when_disabled(artefacts): + model = _build(artefacts, "bsu", keep_spontaneous=False) + assert "R00100" not in model.reactions + assert "R00010" in model.reactions # the gene-backed reaction stays + + +# --------------------------------------------------------------------------- # +# Quality filters take precedence over having genes +# --------------------------------------------------------------------------- # +def _tiny_general_case(): + ref = cobra.Model("KEGG") + a = cobra.Metabolite("C1", compartment="s") + b = cobra.Metabolite("C2", compartment="s") + ref.add_metabolites([a, b]) + rxn = cobra.Reaction("R1") + ref.add_reactions([rxn]) + rxn.add_metabolites({a: -1, b: 1}) + ko_reaction = pd.DataFrame([("K1", "R1")], columns=["ko", "reaction"]) + ogk = pd.DataFrame([("xyz", "g1", "K1")], columns=["organism", "gene", "ko"]) + flags = pd.DataFrame( + [("R1", False, False, False, True)], + columns=["reaction", "spontaneous", "undefined_stoich", "incomplete", "general"], + ) + return ref, ko_reaction, ogk, flags + + +def test_general_filter_drops_reaction_with_genes(): + ref, ko_reaction, ogk, flags = 
_tiny_general_case() + model = get_kegg_model_for_organism("xyz", ref, ko_reaction, ogk, rxn_flags=flags) + assert "R1" not in model.reactions # general + keep_general=False (default) + + +def test_general_kept_when_enabled(): + ref, ko_reaction, ogk, flags = _tiny_general_case() + model = get_kegg_model_for_organism( + "xyz", ref, ko_reaction, ogk, rxn_flags=flags, keep_general=True + ) + assert model.reactions.get_by_id("R1").gene_reaction_rule == "g1" + + +# --------------------------------------------------------------------------- # +# Validation + artefact loading +# --------------------------------------------------------------------------- # +def test_unknown_organism_raises(artefacts): + with pytest.raises(ValueError, match="no genes"): + _build(artefacts, "zzz") + + +def test_domain_mode_needs_taxonomy(artefacts): + with pytest.raises(ValueError, match="taxonomy"): + _build(artefacts, "eukaryotes") + + +def test_domain_mode_keeps_all_domain_organisms(artefacts): + # Prokaryotes (bsu + eco) -> R00010 (bsu genes) and R00100 (eco gene). + model = _build(artefacts, "prokaryotes", taxonomy=DUMP / "taxonomy") + assert "R00010" in model.reactions + assert "R00100" in model.reactions + # Genes are organism-qualified in domain mode to stay distinct. + assert {g.id for g in model.reactions.get_by_id("R00010").genes} == { + "bsu:BSU31050", + "bsu:BSU31060", + } + + +def test_domain_mode_eukaryotes(artefacts): + # Eukaryotes (hsa) -> R00010 via hsa:124/125; eco-only R00100 absent of genes + # but it is spontaneous, so kept without GPR. 
+ model = _build(artefacts, "eukaryotes", taxonomy=DUMP / "taxonomy") + assert {g.id for g in model.reactions.get_by_id("R00010").genes} == { + "hsa:124", + "hsa:125", + } + + +def test_from_artefacts_roundtrip(tmp_path): + parse_kegg_dump(DUMP, tmp_path) + model = get_kegg_model_for_organism_from_artefacts("eco", tmp_path) + assert {r.id for r in model.reactions} == {"R00100"} + assert model.reactions.get_by_id("R00100").gene_reaction_rule == "b0001" diff --git a/tests/test_reconstruction_kegg_parse.py b/tests/test_reconstruction_kegg_parse.py new file mode 100644 index 0000000..23d8f71 --- /dev/null +++ b/tests/test_reconstruction_kegg_parse.py @@ -0,0 +1,220 @@ +"""Tests for the KEGG dump parser (reconstruction/kegg/parse.py, step 3b.2).""" +from pathlib import Path + +import pytest + +from raven_python.reconstruction.kegg import ( + build_kegg_tables, + build_reference_model, + parse_kegg_compounds, + parse_kegg_dump, + parse_kegg_kos, + parse_kegg_reactions, + read_kegg_table, + write_kegg_tables, +) + +DUMP = Path(__file__).parent / "data" / "kegg_dump" + + +@pytest.fixture(scope="module") +def reactions(): + return parse_kegg_reactions(DUMP) + + +@pytest.fixture(scope="module") +def compounds(): + return parse_kegg_compounds(DUMP) + + +@pytest.fixture(scope="module") +def kos(): + linked = {ko for r in parse_kegg_reactions(DUMP) for ko in r.kos} + return parse_kegg_kos(DUMP, keep=linked) + + +# --------------------------------------------------------------------------- # +# Reactions +# --------------------------------------------------------------------------- # +def test_reactions_parsed(reactions): + assert {r.id for r in reactions} == {"R00010", "R00100", "R00200", "R00300", "R00400"} + + +def test_reaction_fields(reactions): + r = next(r for r in reactions if r.id == "R00010") + assert r.name == "alpha,alpha-trehalose glucohydrolase" + assert r.eccodes == ["3.2.1.28"] + assert r.kos == ["K01194"] + # rn01100 is an overview map and must be skipped. 
+ assert r.pathways == ["rn00500"] + + +def test_stoichiometry_cached(reactions): + """parse_kegg_reactions populates the cached stoichiometry so + build_reference_model doesn't have to re-parse (known_issues.md D2).""" + r = next(r for r in reactions if r.id == "R00010") + assert r.stoichiometry # non-empty + # Reactants negative, products positive. + assert all(c != 0 for c in r.stoichiometry.values()) + assert any(c < 0 for c in r.stoichiometry.values()) + assert any(c > 0 for c in r.stoichiometry.values()) + + +def test_spontaneous_flag(reactions): + assert next(r for r in reactions if r.id == "R00100").spontaneous + assert not next(r for r in reactions if r.id == "R00010").spontaneous + + +def test_general_flag(reactions): + assert next(r for r in reactions if r.id == "R00300").general + + +def test_undefined_stoich_flag(reactions): + assert next(r for r in reactions if r.id == "R00200").undefined_stoich + assert not next(r for r in reactions if r.id == "R00010").undefined_stoich + + +def test_mapformula_makes_irreversible(reactions): + # R00100 is drawn one direction in its only map -> irreversible. + assert not next(r for r in reactions if r.id == "R00100").reversible + # R00010 is drawn in conflicting directions across maps -> stays reversible. 
+ assert next(r for r in reactions if r.id == "R00010").reversible + + +# --------------------------------------------------------------------------- # +# Compounds +# --------------------------------------------------------------------------- # +def test_compound_first_name_only(compounds): + water = next(c for c in compounds if c.id == "C00001") + assert water.name == "H2O" + assert water.chebi == ["CHEBI:15377"] + assert water.pubchem == ["3303"] + + +def test_inchi_overrides_formula(compounds): + glucose = next(c for c in compounds if c.id == "C00031") + assert glucose.inchi.startswith("InChI=") + assert glucose.formula == "" # cleared when an InChI is available + assert glucose.chebi == ["CHEBI:4167", "CHEBI:17634"] + + +# --------------------------------------------------------------------------- # +# KOs / genes +# --------------------------------------------------------------------------- # +def test_kos_limited_to_keep(kos): + # K99999 is unlinked (excluded by keep); K09999 is referenced but absent. 
+ assert {ko.id for ko in kos} == {"K01194", "K00002"} + + +def test_ko_genes_lowercased_and_stripped(kos): + k = next(ko for ko in kos if ko.id == "K01194") + assert k.name == "alpha,alpha-trehalase [EC:3.2.1.28]" + assert ("bsu", "BSU31050") in k.genes # '(gbsB)' suffix stripped, org lowercased + assert ("hsa", "125") in k.genes + + +# --------------------------------------------------------------------------- # +# Reference model +# --------------------------------------------------------------------------- # +def test_reference_model_is_gene_free(reactions, compounds): + model = build_reference_model(reactions, compounds) + assert len(model.genes) == 0 + for rxn in model.reactions: + assert rxn.gene_reaction_rule == "" + + +def test_empty_reaction_dropped(reactions, compounds): + model = build_reference_model(reactions, compounds) + assert "R00400" not in model.reactions # C00007 <=> C00007 cancels out + assert "C00007" not in model.metabolites # and its only metabolite is unused + + +def test_reaction_bounds_follow_reversibility(reactions, compounds): + model = build_reference_model(reactions, compounds) + assert model.reactions.get_by_id("R00010").bounds == (-1000.0, 1000.0) + assert model.reactions.get_by_id("R00100").bounds == (0.0, 1000.0) + + +def test_reaction_stoichiometry_and_annotation(reactions, compounds): + model = build_reference_model(reactions, compounds) + r = model.reactions.get_by_id("R00010") + coefs = {m.id: c for m, c in r.metabolites.items()} + assert coefs == {"C01083": -1.0, "C00001": -1.0, "C00031": 2.0} + assert r.annotation["kegg.orthology"] == ["K01194"] + assert r.annotation["ec-code"] == ["3.2.1.28"] + + +def test_metabolite_annotation(reactions, compounds): + model = build_reference_model(reactions, compounds) + glucose = model.metabolites.get_by_id("C00031") + assert glucose.name == "D-Glucose" + assert glucose.annotation["inchi"].startswith("InChI=") + + +# 
--------------------------------------------------------------------------- # +# Tables +# --------------------------------------------------------------------------- # +def test_ko_reaction_table(reactions, kos): + tables = build_kegg_tables(reactions, kos) + pairs = set(map(tuple, tables["ko_reaction"].to_numpy())) + assert ("K01194", "R00010") in pairs + assert ("K09999", "R00300") in pairs # kept even though KO entry is missing + + +def test_organism_gene_ko_table(reactions, kos): + tables = build_kegg_tables(reactions, kos) + rows = set(map(tuple, tables["organism_gene_ko"].to_numpy())) + assert ("bsu", "BSU31050", "K01194") in rows + assert ("eco", "b0001", "K00002") in rows + assert len(rows) == 5 + + +def test_rxn_flags_table(reactions, kos): + tables = build_kegg_tables(reactions, kos) + flags = tables["rxn_flags"].set_index("reaction") + assert bool(flags.loc["R00100", "spontaneous"]) + assert bool(flags.loc["R00200", "undefined_stoich"]) + assert bool(flags.loc["R00300", "general"]) + assert not bool(flags.loc["R00010", "spontaneous"]) + + +# --------------------------------------------------------------------------- # +# Round-trip + orchestrator +# --------------------------------------------------------------------------- # +def test_tables_roundtrip_gzipped_tsv(reactions, kos, tmp_path): + tables = build_kegg_tables(reactions, kos) + paths = write_kegg_tables(tables, tmp_path) + assert all(p.name.endswith(".tsv.gz") for p in paths) + back = read_kegg_table(tmp_path / "ko_reaction.tsv.gz") + assert set(map(tuple, back.to_numpy())) == set(map(tuple, tables["ko_reaction"].to_numpy())) + + +def test_parse_kegg_dump_writes_artefacts(tmp_path): + paths = parse_kegg_dump(DUMP, tmp_path) + assert set(paths) >= { + "ko_reaction", "ko_names", "organism_gene_ko", "rxn_flags", "reference_model" + } + assert (tmp_path / "reference_model.yml.gz").is_file() + # organism_gene_ko is streamed to a sorted, xz-compressed TSV. 
+ assert paths["organism_gene_ko"].name == "organism_gene_ko.tsv.xz" + ogk = read_kegg_table(paths["organism_gene_ko"]) + assert set(ogk.columns) == {"organism", "gene", "ko"} + assert ("eco", "b0001", "K00002") in set(map(tuple, ogk.to_numpy())) + # Rows are sorted by (organism, gene) — the property that makes them compress. + keys = list(zip(ogk["organism"], ogk["gene"], strict=True)) + assert keys == sorted(keys) + + +def test_stream_organism_gene_ko_external_merge(tmp_path): + """A tiny chunk_rows forces multiple sorted runs to be merged; output stays sorted.""" + from raven_python.reconstruction.kegg.parse import stream_organism_gene_ko + + out = tmp_path / "organism_gene_ko.tsv.xz" + keep = {ko.id for ko in parse_kegg_kos(DUMP)} + names = stream_organism_gene_ko(DUMP, keep, out, chunk_rows=1) + assert out.is_file() and not list(tmp_path.glob("ogk_sort_*")) # temp dir cleaned up + ogk = read_kegg_table(out) + keys = list(zip(ogk["organism"], ogk["gene"], strict=True)) + assert keys == sorted(keys) + assert ("eco", "b0001", "K00002") in set(map(tuple, ogk.to_numpy())) + assert set(names.columns) == {"ko", "name"} diff --git a/tests/test_reconstruction_kegg_query.py b/tests/test_reconstruction_kegg_query.py new file mode 100644 index 0000000..49aae60 --- /dev/null +++ b/tests/test_reconstruction_kegg_query.py @@ -0,0 +1,132 @@ +"""Tests for the KEGG HMM-query path (reconstruction/kegg/query.py, step 3b.5).""" +from pathlib import Path + +import pandas as pd +import pytest + +from raven_python.reconstruction.kegg import ( + assign_kos, + build_kegg_tables, + build_reference_model, + get_kegg_model_from_sequences, + parse_hmmscan_tblout, + parse_kegg_compounds, + parse_kegg_kos, + parse_kegg_reactions, +) + +DUMP = Path(__file__).parent / "data" / "kegg_dump" + +# A minimal hmmscan --tblout excerpt: target(KO) accession query(gene) ... evalue ... 
+TBLOUT = """\ +# --- full sequence ---- +# target name accession query name accession E-value score bias +#------------------- ---------- ----------- ---------- --------- ------ ----- +K01194 - gene1 - 1e-120 400.0 0.0 +K01194 - gene2 - 1e-100 350.0 0.0 +K00002 - gene1 - 1e-10 40.0 0.0 +""" + + +# --------------------------------------------------------------------------- # +# Parsing +# --------------------------------------------------------------------------- # +def test_parse_tblout_skips_comments(): + hits = parse_hmmscan_tblout(TBLOUT) + assert list(hits.columns) == ["ko", "gene", "evalue"] + assert len(hits) == 3 + assert set(hits["ko"]) == {"K01194", "K00002"} + assert hits.iloc[0]["evalue"] == 1e-120 + + +def test_parse_tblout_empty(): + assert parse_hmmscan_tblout("# only a header\n").empty + + +# --------------------------------------------------------------------------- # +# assign_kos scoring/filters +# --------------------------------------------------------------------------- # +def test_cutoff_excludes_weak_hits(): + hits = parse_hmmscan_tblout(TBLOUT) + # gene1->K00002 has evalue 1e-10, above the default cutoff 1e-30: dropped. + assigned = assign_kos(hits) + assert "K00002" not in assigned + assert set(assigned["K01194"]) == {"gene1", "gene2"} + + +def test_loose_cutoff_keeps_hit(): + hits = parse_hmmscan_tblout(TBLOUT) + assigned = assign_kos(hits, cutoff=1e-5, min_score_ratio_g=0.0, min_score_ratio_ko=0.0) + assert assigned.get("K00002") == ["gene1"] + + +def test_min_score_ratio_ko_prunes_weak_member(): + # In one KO: best 1e-200, weak 1e-20. log(1e-20)/log(1e-200)=0.1 < 0.3 -> pruned. + hits = pd.DataFrame( + [("K1", "strong", 1e-200), ("K1", "weak", 1e-20)], + columns=["ko", "gene", "evalue"], + ) + assigned = assign_kos(hits, cutoff=1e-5, min_score_ratio_ko=0.3, min_score_ratio_g=0.0) + assert assigned["K1"] == ["strong"] + + +def test_min_score_ratio_g_keeps_gene_in_best_ko_only(): + # gene g hits K1 strongly (1e-200) and K2 weakly (1e-20). 
+ # For the gene: log(1e-20)/log(1e-200)=0.1 < 0.8 -> K2 assignment dropped. + hits = pd.DataFrame( + [("K1", "g", 1e-200), ("K2", "g", 1e-20)], + columns=["ko", "gene", "evalue"], + ) + assigned = assign_kos(hits, cutoff=1e-5, min_score_ratio_ko=0.0, min_score_ratio_g=0.8) + assert assigned == {"K1": ["g"]} + + +def test_zero_evalue_does_not_crash(): + hits = pd.DataFrame([("K1", "g", 0.0)], columns=["ko", "gene", "evalue"]) + assert assign_kos(hits) == {"K1": ["g"]} + + +def test_cutoff_ge_one_rejected(): + """cutoff >= 1 would let log(best_evalue)=0 through and ZeroDivisionError later + (known_issues.md A6). Reject up front with a clear message.""" + hits = pd.DataFrame([("K1", "g", 0.5)], columns=["ko", "gene", "evalue"]) + with pytest.raises(ValueError, match="cutoff must be < 1"): + assign_kos(hits, cutoff=1.0) + + +# --------------------------------------------------------------------------- # +# Model assembly via the HMM path (hmmscan mocked) +# --------------------------------------------------------------------------- # +@pytest.fixture(scope="module") +def reference_and_tables(): + reactions = parse_kegg_reactions(DUMP) + compounds = parse_kegg_compounds(DUMP) + linked = {ko for r in reactions for ko in r.kos} + kos = parse_kegg_kos(DUMP, keep=linked) + return build_reference_model(reactions, compounds), build_kegg_tables(reactions, kos) + + +def test_get_model_from_sequences(reference_and_tables, monkeypatch): + model_ref, tables = reference_and_tables + # Mock the HMM search: K01194 -> myGeneA/myGeneB (-> R00010). 
+ monkeypatch.setattr( + "raven_python.reconstruction.kegg.query.run_hmmscan", + lambda *a, **k: ( + "K01194 - myGeneA - 1e-120 400 0\n" + "K01194 - myGeneB - 1e-110 380 0\n" + ), + ) + model = get_kegg_model_from_sequences( + "ignored.fasta", + model_ref, + tables["ko_reaction"], + "ignored.hmm", + rxn_flags=tables["rxn_flags"], + model_id="myorg", + ) + assert model.id == "myorg" + r = model.reactions.get_by_id("R00010") + assert set(r.gene_reaction_rule.split(" or ")) == {"myGeneA", "myGeneB"} + assert r.notes["note"].endswith("(using HMMs)") + # R00200/R00300 had no matched KOs and are not spontaneous -> absent. + assert "R00200" not in model.reactions diff --git a/tests/test_scripts_registry.py b/tests/test_scripts_registry.py new file mode 100644 index 0000000..c9c03cf --- /dev/null +++ b/tests/test_scripts_registry.py @@ -0,0 +1,58 @@ +"""Tests for scripts/make_registry_snippet.py registry-entry helpers.""" +import hashlib +import importlib.util +import json +from pathlib import Path + +import pytest + +# scripts/ is not a package; load the module directly by path. 
+_SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "make_registry_snippet.py" +_spec = importlib.util.spec_from_file_location("make_registry_snippet", _SCRIPT) +mrs = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(mrs) + + +def _sha(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +def test_data_entry_lists_files_with_urls_and_checksums(tmp_path): + (tmp_path / "reference_model.yml.gz").write_bytes(b"model") + (tmp_path / "ko_reaction.tsv.gz").write_bytes(b"table") + (tmp_path / ".hidden").write_bytes(b"skip") # hidden files ignored + + entry = mrs.data_entry("kegg", "kegg116", "https://x/rel/", tmp_path) + assert entry["version"] == "kegg116" + assert set(entry["files"]) == {"reference_model.yml.gz", "ko_reaction.tsv.gz"} + ref = entry["files"]["reference_model.yml.gz"] + assert ref["url"] == "https://x/rel/reference_model.yml.gz" # trailing slash collapsed + assert ref["sha256"] == _sha(b"model") + + +def test_data_entry_empty_dir_errors(tmp_path): + with pytest.raises(SystemExit): + mrs.data_entry("kegg", "v1", "https://x", tmp_path) + + +def test_binary_entry_parses_platform_from_filename(tmp_path): + (tmp_path / "blast-2.16.0-linux-x86_64.zip").write_bytes(b"linux") + (tmp_path / "blast-2.16.0-macos-arm64.zip").write_bytes(b"mac") + (tmp_path / "other-1.0-linux-x86_64.zip").write_bytes(b"nope") # different bundle + + entry = mrs.binary_entry("blast", "2.16.0", ["blastp", "makeblastdb"], "https://x", tmp_path) + assert entry["provides"] == ["blastp", "makeblastdb"] + assert set(entry["platforms"]) == {"linux-x86_64", "macos-arm64"} + assert entry["platforms"]["macos-arm64"]["sha256"] == _sha(b"mac") + assert entry["platforms"]["linux-x86_64"]["url"].endswith("blast-2.16.0-linux-x86_64.zip") + + +def test_binary_entry_no_zips_errors(tmp_path): + with pytest.raises(SystemExit): + mrs.binary_entry("blast", "2.16.0", ["blastp"], "https://x", tmp_path) + + +def test_render_is_valid_json_round_trip(): + entry = 
{"version": "v1", "files": {"a": {"url": "u", "sha256": "s"}}} + text = mrs.render("kegg", entry) + assert json.loads(text) == {"kegg": entry} diff --git a/tests/test_tasks.py b/tests/test_tasks.py new file mode 100644 index 0000000..1b5e6bd --- /dev/null +++ b/tests/test_tasks.py @@ -0,0 +1,189 @@ +"""Tests for metabolic tasks (Phase 4a): parse_task_list + check_tasks.""" +import cobra +import pytest + +from raven_python.tasks import Task, check_tasks, parse_task_list + +TASK_TSV = ( + "ID\tDESCRIPTION\tIN\tIN UB\tOUT\tOUT LB\tEQU\tSHOULD FAIL\n" + "T1\tgrowth\tglc[e];o2[e]\t10\tbio[c]\t1\t\t\n" + "T2\tinfeasible\t\t\tatp[c]\t1\t\ttrue\n" + "\t\t\t\tnadh[c]\t\t\t\n" + "T3\twithequ\tA[c]\t\tB[c]\t\tA[c] <=> B[c]\t\n" +) + + +# --------------------------------------------------------------------------- # +# parse_task_list +# --------------------------------------------------------------------------- # +@pytest.fixture +def task_file(tmp_path): + p = tmp_path / "tasks.txt" + p.write_text(TASK_TSV) + return p + + +def test_parse_basic_and_defaults(task_file): + tasks = parse_task_list(task_file) + assert [t.id for t in tasks] == ["T1", "T2", "T3"] + t1 = tasks[0] + assert t1.description == "growth" + # ';' splits mets sharing the row's bounds; IN LB defaults 0, IN UB from cell. 
+ assert t1.inputs == [("glc[e]", 0.0, 10.0), ("o2[e]", 0.0, 10.0)] + assert t1.outputs == [("bio[c]", 1.0, 1000.0)] # OUT UB defaults 1000 + + +def test_parse_should_fail_and_continuation(task_file): + t2 = parse_task_list(task_file)[1] + assert t2.should_fail is True + # continuation row (empty ID) appends nadh[c] to the same task's outputs + assert t2.outputs == [("atp[c]", 1.0, 1000.0), ("nadh[c]", 0.0, 1000.0)] + + +def test_parse_equation_default_bounds(task_file): + t3 = parse_task_list(task_file)[2] + # reversible '<=>' -> EQU LB defaults -1000, UB 1000 + assert t3.equations == [("A[c] <=> B[c]", -1000.0, 1000.0)] + + +def test_parse_missing_id_column(tmp_path): + p = tmp_path / "bad.txt" + p.write_text("FOO\tBAR\nx\ty\n") + with pytest.raises(ValueError, match="ID"): + parse_task_list(p) + + +def test_parse_warns_on_data_row_before_first_id(tmp_path): + """known_issues.md B3: continuation rows appearing before the first task ID + used to be silently dropped. Now warns so the user sees the malformed file.""" + p = tmp_path / "orphan.txt" + p.write_text( + "ID\tDESCRIPTION\tIN\tIN UB\tOUT\tOUT UB\tSHOULD FAIL\n" + "\t\tglc[e]\t10\t\t\t\n" # orphan data row, no ID seen yet + "T1\tgrowth\t\t\tbio[c]\t1\t\n" + ) + with pytest.warns(UserWarning, match="no task ID has been seen yet"): + tasks = parse_task_list(p) + assert [t.id for t in tasks] == ["T1"] + # The orphan row's data isn't grafted onto T1 either. 
+ assert tasks[0].inputs == [] + + +def test_parse_task_list_xlsx_missing_tasks_sheet(tmp_path): + """A .xlsx without a 'TASKS' sheet used to raise a bare KeyError; now + raises a clear ValueError naming the actual sheets (known_issues.md C3).""" + pytest.importorskip("openpyxl") + from openpyxl import Workbook + + wb = Workbook() + wb.active.title = "NotTasks" + p = tmp_path / "wrong.xlsx" + wb.save(p) + with pytest.raises(ValueError, match="no sheet named 'TASKS'"): + parse_task_list(p) + + +# --------------------------------------------------------------------------- # +# check_tasks +# --------------------------------------------------------------------------- # +def _met(mid, name, comp="c"): + return cobra.Metabolite(mid, name=name, compartment=comp) + + +@pytest.fixture +def model(): + """Closed model: A -> B (r1); D present but unproduced.""" + m = cobra.Model("t") + A, B, D = _met("A_c", "A"), _met("B_c", "B"), _met("D_c", "D") + m.add_metabolites([A, B, D]) + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r1.add_metabolites({A: -1, B: 1}) + m.add_reactions([r1]) + return m + + +def _by_id(results): + return {r.id: r for r in results} + + +def test_feasible_task_passes(model): + # OUT LB=1 requires producing B (LB=0 would pass trivially via zero flux). + task = Task("make_B", inputs=[("A[c]", 0, 1000)], outputs=[("B[c]", 1, 1000)]) + (res,) = check_tasks(model, [task]) + assert res.feasible and res.passed + + +def test_should_fail_task_passes_when_infeasible(model): + # Require producing B with no input -> infeasible -> should_fail makes it pass. 
+ task = Task("no_input", outputs=[("B[c]", 1, 1000)], should_fail=True) + (res,) = check_tasks(model, [task]) + assert not res.feasible and res.passed + + +def test_unsatisfiable_task_fails(model): + task = Task("need_B", outputs=[("B[c]", 1, 1000)]) # no input, not should_fail + (res,) = check_tasks(model, [task]) + assert not res.feasible and not res.passed + + +def test_equation_adds_pathway(model): + # Model can't make D; the task's extra reaction B -> D enables output of D. + task = Task( + "make_D", + inputs=[("A[c]", 0, 1000)], + outputs=[("D[c]", 1, 1000)], + equations=[("B[c] => D[c]", 0.0, 1000.0)], + ) + (res,) = check_tasks(model, [task]) + assert res.passed + # without the extra reaction D cannot be made + (res2,) = check_tasks(model, [Task("make_D2", inputs=[("A[c]", 0, 1000)], outputs=[("D[c]", 1, 1000)])]) + assert not res2.passed + + +def test_changed_bounds_block_reaction(model): + # Blocking r1 makes B unproducible. + task = Task( + "block_r1", + inputs=[("A[c]", 0, 1000)], + outputs=[("B[c]", 1, 1000)], + changed=[("r1", 0.0, 0.0)], + ) + (res,) = check_tasks(model, [task]) + assert not res.passed + + +def test_allmets_output(model): + # Force uptake of A (IN LB=1); the only fate is A->B, so B must be excreted. + # ALLMETS output permits that, making the task feasible; without it B accumulates. 
+ task = Task("sink_all", inputs=[("A[c]", 1, 1000)], outputs=[("ALLMETS", 0, 1000)]) + (res,) = check_tasks(model, [task]) + assert res.passed + (res2,) = check_tasks(model, [Task("forced_no_out", inputs=[("A[c]", 1, 1000)])]) + assert not res2.passed # forced A uptake but nowhere for B to go + + +def test_unknown_metabolite_reported(model): + task = Task("typo", inputs=[("Z[c]", 0, 1000)], outputs=[("B[c]", 0, 1000)]) + (res,) = check_tasks(model, [task]) + assert not res.passed and "unknown metabolite" in res.error + + +def test_open_exchange_is_closed_so_task_controls_io(model): + # An open demand for B would let B leave for free; check_tasks closes it, so a + # task with no output for B and a forced... here: B has an open sink, but the + # task defines only input A and no output -> B must still balance (sink closed). + model.add_boundary(model.metabolites.B_c, type="sink") # open B sink + task = Task("need_D_out", inputs=[("A[c]", 0, 1000)], outputs=[("D[c]", 1, 1000)]) + (res,) = check_tasks(model, [task]) + assert not res.passed # D still cannot be produced despite the (now-closed) B sink + + +def test_check_tasks_accepts_a_file_path(model, tmp_path): + p = tmp_path / "t.txt" + p.write_text( + "ID\tDESCRIPTION\tIN\tOUT\tOUT LB\n" + "make_B\tconvert\tA[c]\tB[c]\t1\n" + ) + results = check_tasks(model, p) # path, parsed internally + assert _by_id(results)["make_B"].passed diff --git a/tests/test_tasks_essential.py b/tests/test_tasks_essential.py new file mode 100644 index 0000000..5352378 --- /dev/null +++ b/tests/test_tasks_essential.py @@ -0,0 +1,114 @@ +"""Phase 4d.1: essential-reaction discovery for tasks (find_task_essential_reactions). + +Oracle: RAVEN tinitTests T0002 — for testModel + the "make e[s] from a[s]" task, the +pre-merge essential reactions are R2 (the only a[s]<->a[c] link) and R7 (the only +e[c]->e[s] producer); the alternative internal paths make nothing else essential. 
+""" +import cobra +from tinit_oracles import ( + TEST_MODEL_TASK_ESSENTIAL_PREMERGE, + make_test_model, + make_test_task, +) + +from raven_python.tasks import ( + EssentialReactionsResult, + Task, + find_task_essential_reactions, +) + + +def test_essential_reactions_match_oracle(): + res = find_task_essential_reactions(make_test_model(), [make_test_task()]) + assert isinstance(res, EssentialReactionsResult) + assert sorted(res.reactions) == TEST_MODEL_TASK_ESSENTIAL_PREMERGE # ['R2', 'R7'] + assert not res.failed_tasks + + +def test_essential_directions_are_forward(): + """R2 (a[s]->a[c]) and R7 (e[c]->e[s]) both carry positive flux for this task.""" + res = find_task_essential_reactions(make_test_model(), [make_test_task()]) + assert res.reactions == {"R2": 1, "R7": 1} + + +def test_task_metabolites_collected(): + """a[s] and e[s] are referenced by the task and must be protected from removal.""" + res = find_task_essential_reactions(make_test_model(), [make_test_task()]) + m = make_test_model() + names = {res_id: f"{m.metabolites.get_by_id(res_id).name}" + f"[{m.metabolites.get_by_id(res_id).compartment}]" for res_id in res.task_metabolites} + assert set(names.values()) == {"a[s]", "e[s]"} + + +def test_no_task_no_essentials(): + res = find_task_essential_reactions(make_test_model(), []) + assert res.reactions == {} and res.per_task == {} + + +def test_equation_metabolites_are_protected(): + """A task equation's metabolites count as task metabolites (protected from removal).""" + m = make_test_model() + task = Task( + id="equ", + inputs=[("a[s]", 0.0, 1000.0)], + outputs=[("e[c]", 1.0, 1.0)], + equations=[("a[c] => e[c]", 0.0, 1000.0)], # references a[c], which is not an I/O met + ) + res = find_task_essential_reactions(m, [task]) + names = {f"{m.metabolites.get_by_id(i).name}[{m.metabolites.get_by_id(i).compartment}]" + for i in res.task_metabolites} + assert {"a[c]", "e[c]"} <= names and "equ" not in res.failed_tasks + + +def 
test_infeasible_task_is_reported_failed(): + """A task requiring an impossible output is dropped, not crashed.""" + impossible = Task(id="bad", outputs=[("z[s]", 1.0, 1.0)]) + # z[s] doesn't exist -> unknown metabolite -> failed. + res = find_task_essential_reactions(make_test_model(), [impossible]) + assert res.failed_tasks == ["bad"] and res.reactions == {} + + +def test_should_fail_task_defines_no_essentials(): + res = find_task_essential_reactions( + make_test_model(), [Task(id="sf", should_fail=True, outputs=[("e[s]", 1.0, 1.0)])] + ) + assert res.reactions == {} and "sf" not in res.per_task + + +def test_direction_majority_across_tasks(): + """A reaction essential reverse in two tasks and forward in one is recorded reverse.""" + # Build a tiny model where a single reaction must run in a chosen direction. + m = cobra.Model("dir") + a, b = (cobra.Metabolite(x, name=x, compartment="s") for x in "ab") + m.add_metabolites([a, b]) + r = cobra.Reaction("REV", lower_bound=-1000, upper_bound=1000) + r.add_metabolites({a: -1, b: 1}) # a <=> b + m.add_reactions([r]) + m.objective = "REV" + # Task forcing net production of b from a -> REV forward (+1). + fwd = Task(id="fwd", inputs=[("a[s]", 0.0, 1000.0)], outputs=[("b[s]", 1.0, 1.0)]) + # Two tasks forcing net production of a from b -> REV reverse (-1). Distinct ids + # (task lists have unique ids; essential discovery de-duplicates by id). + rev1 = Task(id="rev1", inputs=[("b[s]", 0.0, 1000.0)], outputs=[("a[s]", 1.0, 1.0)]) + rev2 = Task(id="rev2", inputs=[("b[s]", 0.0, 1000.0)], outputs=[("a[s]", 1.0, 1.0)]) + res = find_task_essential_reactions(m, [rev1, rev2, fwd]) + assert res.reactions["REV"] == -1 # two reverse votes beat one forward + + +def test_duplicate_name_comp_metabolites_both_constrained(): + """A task referencing a name[comp] shared by two metabolites resolves (not 'missing').""" + m = cobra.Model("dup") + # Two distinct metabolites with the SAME name and compartment. 
+ a1 = cobra.Metabolite("a1", name="a", compartment="s") + a2 = cobra.Metabolite("a2", name="a", compartment="s") + b = cobra.Metabolite("b", name="b", compartment="s") + m.add_metabolites([a1, a2, b]) + r1 = cobra.Reaction("R1", lower_bound=0, upper_bound=1000) + r1.add_metabolites({a1: -1, b: 1}) # only a1 feeds b + m.add_reactions([r1]) + m.objective = "R1" + # Output b from input a -> 'a[s]' matches both a1 and a2; must not be reported missing. + task = Task(id="t", inputs=[("a[s]", 0.0, 1000.0)], outputs=[("b[s]", 1.0, 1.0)]) + res = find_task_essential_reactions(m, [task]) + assert res.failed_tasks == [] # 'a[s]' resolved (to both a1 and a2), task feasible + assert "R1" in res.reactions diff --git a/tests/test_utils_balance.py b/tests/test_utils_balance.py new file mode 100644 index 0000000..aa1e47e --- /dev/null +++ b/tests/test_utils_balance.py @@ -0,0 +1,76 @@ +"""Tests for get_elemental_balance (getElementalBalance port).""" +import cobra +import pytest + +from raven_python.utils import ElementalBalance, get_elemental_balance + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", formula="C6H12O6", charge=0, compartment="c"), + cobra.Metabolite("b_c", formula="C6H12O6", charge=0, compartment="c"), + cobra.Metabolite("c_c", formula="C3H6O3", charge=0, compartment="c"), + cobra.Metabolite("n_c", compartment="c"), # no formula + ] + ) + r_bal = cobra.Reaction("R_bal") + m.add_reactions([r_bal]) + r_bal.build_reaction_from_string("a_c --> b_c") # C6H12O6 -> C6H12O6 + r_unbal = cobra.Reaction("R_unbal") + m.add_reactions([r_unbal]) + r_unbal.build_reaction_from_string("a_c --> c_c") # C6H12O6 -> C3H6O3 + r_unknown = cobra.Reaction("R_unknown") + m.add_reactions([r_unknown]) + r_unknown.build_reaction_from_string("a_c --> n_c") # n_c has no formula + return m + + +def test_balanced(model): + (res,) = get_elemental_balance(model, ["R_bal"]) + assert res == ElementalBalance("R_bal", "balanced", {}) + + +def 
test_unbalanced_reports_imbalance(model): + (res,) = get_elemental_balance(model, ["R_unbal"]) + assert res.status == "unbalanced" + # products - reactants: C3H6O3 - C6H12O6 = -C3H6O3 + assert res.imbalance == {"C": -3.0, "H": -6.0, "O": -3.0} + + +def test_missing_formula_is_unknown_not_silently_wrong(model): + # cobra's check_mass_balance alone would silently report an imbalance here; + # we flag it as unknown instead. + (res,) = get_elemental_balance(model, ["R_unknown"]) + assert res.status == "unknown" + assert res.imbalance == {} + + +def test_all_reactions_default(model): + results = get_elemental_balance(model) + assert {r.reaction_id: r.status for r in results} == { + "R_bal": "balanced", + "R_unbal": "unbalanced", + "R_unknown": "unknown", + } + + +def test_charge_excluded(model): + # give a charge imbalance but keep elements balanced -> still "balanced" + model.metabolites.get_by_id("b_c").charge = 1 + (res,) = get_elemental_balance(model, ["R_bal"]) + assert res.status == "balanced" + + +# --- regression: empty reaction → unknown (known_issues.md F5) ------------- + +def test_empty_reaction_is_unknown(model): + """A reaction with no metabolites used to be reported `balanced` + vacuously (any() over an empty list is False and check_mass_balance + returns no imbalance). 
Now reports `unknown`.""" + empty = cobra.Reaction("R_empty", lower_bound=0, upper_bound=1000) + model.add_reactions([empty]) + (res,) = get_elemental_balance(model, ["R_empty"]) + assert res.status == "unknown" diff --git a/tests/test_utils_gpr.py b/tests/test_utils_gpr.py new file mode 100644 index 0000000..275d020 --- /dev/null +++ b/tests/test_utils_gpr.py @@ -0,0 +1,84 @@ +"""Tests for raven_python.utils.gpr (GPR linting).""" +import cobra +import pytest + +from raven_python.utils import GPRIssue, find_non_dnf_grrules, is_dnf + + +@pytest.mark.parametrize( + "rule", + [ + "", + "G1", + "G1 and G2", + "G1 or G2", + "G1 and G2 and G3", + "G1 or G2 or G3", + "(G1 and G2) or G3", + "(G1 and G2) or (G3 and G4)", + "G1 or (G2 and G3)", + ], +) +def test_is_dnf_true(rule): + assert is_dnf(rule) is True + + +@pytest.mark.parametrize( + "rule", + [ + "(G1 or G2) and G3", + "G1 and (G2 or G3)", + "(G1 or G2) and (G3 or G4)", + "G1 and (G2 or (G3 and G4))", + ], +) +def test_is_dnf_false(rule): + assert is_dnf(rule) is False + + +def test_is_dnf_accepts_gpr_and_none(): + from cobra.core.gene import GPR + + assert is_dnf(GPR.from_string("(G1 or G2) and G3")) is False + assert is_dnf(GPR.from_string("G1 or G2")) is True + assert is_dnf(None) is True + + +def test_is_dnf_independent_of_formatting(): + # cobra normalises on assignment, so casing/whitespace cannot change the verdict. 
+ assert is_dnf("(G1 OR G2) AND G3") is False + assert is_dnf("( G1 and G2 ) or G3") is True + + +def _model_with_rules(rules: dict[str, str]) -> cobra.Model: + model = cobra.Model("t") + model.add_reactions([cobra.Reaction(rid) for rid in rules]) + for rid, rule in rules.items(): + model.reactions.get_by_id(rid).gene_reaction_rule = rule + return model + + +def test_find_non_dnf_grrules_flags_only_offenders(): + model = _model_with_rules( + { + "R_ok_single": "G1", + "R_ok_complex": "G1 and G2", + "R_ok_dnf": "(G1 and G2) or G3", + "R_no_gpr": "", + "R_bad_1": "(G1 or G2) and G3", + "R_bad_2": "(G1 or G2) and (G3 or G4)", + } + ) + + issues = find_non_dnf_grrules(model) + + assert [i.reaction_id for i in issues] == ["R_bad_1", "R_bad_2"] + assert all(isinstance(i, GPRIssue) for i in issues) + assert all("disjunctive normal form" in i.reason for i in issues) + # the reported GPR is the cobra-normalised string + assert issues[0].gpr == "(G1 or G2) and G3" + + +def test_find_non_dnf_grrules_empty_when_all_clean(): + model = _model_with_rules({"R1": "G1 or G2", "R2": "(G1 and G2) or G3"}) + assert find_non_dnf_grrules(model) == [] diff --git a/tests/test_utils_sort.py b/tests/test_utils_sort.py new file mode 100644 index 0000000..18bca24 --- /dev/null +++ b/tests/test_utils_sort.py @@ -0,0 +1,42 @@ +"""Tests for sort_identifiers and write_yaml_model(sort_ids=True).""" +import cobra + +from raven_python.io import read_yaml_model, write_yaml_model +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils import sort_identifiers + + +def _model(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("b_c", "a_c")]) + add_reactions_from_equations( + m, + [ + {"id": "R2", "equation": "a_c --> b_c", "gene_reaction_rule": "GB"}, + {"id": "R1", "equation": "b_c --> a_c", "gene_reaction_rule": "GA"}, + ], + ) + return m + + +def test_sort_identifiers_orders_everything(): + m = _model() + 
sort_identifiers(m) + assert [r.id for r in m.reactions] == ["R1", "R2"] + assert [x.id for x in m.metabolites] == ["a_c", "b_c"] + assert [g.id for g in m.genes] == ["GA", "GB"] + # lookup index still intact after sorting + assert m.reactions.get_by_id("R2").id == "R2" + + +def test_write_yaml_sort_ids_does_not_mutate(tmp_path): + m = _model() + order_before = [r.id for r in m.reactions] + out = tmp_path / "m.yml" + write_yaml_model(m, out, sort_ids=True) + assert [r.id for r in m.reactions] == order_before # model untouched + # but the file is sorted + text = out.read_text() + assert text.index("R1") < text.index("R2") + reloaded = read_yaml_model(out) + assert [r.id for r in reloaded.reactions] == ["R1", "R2"] diff --git a/tests/test_utils_validate.py b/tests/test_utils_validate.py new file mode 100644 index 0000000..2d38e6f --- /dev/null +++ b/tests/test_utils_validate.py @@ -0,0 +1,80 @@ +"""Tests for check_model (the surviving checks of checkModelStruct).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils import ModelIssue, check_model + + +def _categories(issues, category): + return [i.object_id for i in issues if i.category == category] + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", name="A", compartment="c"), + cobra.Metabolite("b_c", name="B", compartment="c"), + ] + ) + add_reactions_from_equations( + m, [{"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}] + ) + m.reactions.get_by_id("R1").objective_coefficient = 1 + return m + + +def test_clean_model_has_no_issues(model): + assert check_model(model) == [] + + +def test_orphan_metabolite(model): + model.add_metabolites([cobra.Metabolite("orphan_c", name="Orphan", compartment="c")]) + assert "orphan_c" in _categories(check_model(model), "orphan_metabolite") + + +def test_orphan_gene(model): + model.genes.append(cobra.core.gene.Gene("G_lonely")) + assert 
"G_lonely" in _categories(check_model(model), "orphan_gene") + + +def test_empty_reaction(model): + model.add_reactions([cobra.Reaction("R_empty")]) + assert "R_empty" in _categories(check_model(model), "empty_reaction") + + +def test_empty_metabolite_name(model): + model.add_metabolites([cobra.Metabolite("noname_c", compartment="c")]) + # also an orphan, but we check the name category specifically + assert "noname_c" in _categories(check_model(model), "empty_metabolite_name") + + +def test_duplicate_name_compartment(model): + # second metabolite named "A" in compartment c + dup = cobra.Metabolite("a2_c", name="A", compartment="c") + model.add_metabolites([dup]) + model.reactions.get_by_id("R1").add_metabolites({dup: -1}) # keep it used + issues = [i for i in check_model(model) if i.category == "duplicate_name_compartment"] + assert len(issues) == 1 + assert "a_c" in issues[0].message and "a2_c" in issues[0].message + + +def test_no_objective(model): + model.reactions.get_by_id("R1").objective_coefficient = 0 + cats = [i.category for i in check_model(model)] + assert "objective" in cats + + +def test_multiple_objectives(model): + add_reactions_from_equations(model, [{"id": "R2", "equation": "b_c --> a_c"}]) + model.reactions.get_by_id("R2").objective_coefficient = 1 + obj_issues = [i for i in check_model(model) if i.category == "objective"] + assert len(obj_issues) == 1 + assert "Multiple" in obj_issues[0].message + + +def test_returns_model_issue_instances(model): + model.add_reactions([cobra.Reaction("R_empty")]) + assert all(isinstance(i, ModelIssue) for i in check_model(model)) diff --git a/tests/tinit_oracles.py b/tests/tinit_oracles.py new file mode 100644 index 0000000..638956d --- /dev/null +++ b/tests/tinit_oracles.py @@ -0,0 +1,161 @@ +"""Shared (ft)INIT test oracles, ported from RAVEN's ``tinitTests.m``. 
+ +These toy models have **defined reaction scores** and **known ftINIT outputs**, so +they serve as exact correctness oracles for the Phase 4d port (see +docs/ftinit_review_and_plan.md). Building them here once lets every sub-phase +(essential-reaction discovery, the MILP, linear merge, staging) check against the +same RAVEN-verified answers. + +Reaction scores are injected through gene expression using :func:`expr_for_rxn_score` +(RAVEN's ``getExprForRxnScore``): each toy reaction ``Ri`` has at most one gene +``Gi``, so an expression of ``exp(score_i/5)`` reproduces the desired score exactly +(no-gene reactions get ``no_gene_score = -2`` regardless). +""" +from __future__ import annotations + +import math + +import cobra + + +def expr_for_rxn_score(scores, threshold: float = 1.0) -> dict: + """RAVEN ``getExprForRxnScore``: gene expression giving a target single-gene score. + + Inverts ``score = 5·ln(level/threshold)`` → ``level = threshold·exp(score/5)``. + Returns ``{Gi: level}`` for i = 1..len(scores) (gene name ``"G{i}"``), mirroring the + 1-reaction-1-gene layout of the toy models. + """ + return {f"G{i + 1}": threshold * math.exp(s / 5) for i, s in enumerate(scores)} + + +def _build(model_id, mets, reactions, objective): + """mets: {id: (name, compartment)}; reactions: {id: (stoich, lb, ub, gpr)}.""" + m = cobra.Model(model_id) + met_objs = { + mid: cobra.Metabolite(mid, name=name, compartment=comp) + for mid, (name, comp) in mets.items() + } + m.add_metabolites(list(met_objs.values())) + for rid, (stoich, lb, ub, gpr) in reactions.items(): + r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub) + r.add_metabolites({met_objs[mid]: coeff for mid, coeff in stoich.items()}) + m.add_reactions([r]) + if gpr: + r.gene_reaction_rule = gpr + m.objective = objective + return m + + +# --------------------------------------------------------------------------- # +# testModel — RAVEN getTstModel(): 8 mets, 10 rxns. a[s] -> ... -> e[s] export. 
# --------------------------------------------------------------------------- #
# Metabolite table for testModel in the ``{id: (name, compartment)}`` layout that
# ``_build`` consumes. Each id is the one-letter name followed by the one-letter
# compartment, so ``tuple(id)`` is exactly the ``(name, compartment)`` pair.
_TEST_METS = {
    met_id: tuple(met_id)
    for met_id in ("as", "ac", "bc", "cc", "dc", "ec", "es", "fc")
}

# Reaction table for testModel: ``{id: (stoichiometry, lb, ub, gpr)}``.
# An empty GPR string means the reaction carries no gene.
_TEST_RXNS = {
    # -> a[s] (exchange, no GPR)
    "R1": ({"as": 1}, 0, 1000, ""),
    # a[s] <=> a[c] (transport, no GPR)
    "R2": ({"as": -1, "ac": 1}, -1000, 1000, ""),
    # a[c] <=> b[c] + c[c]
    "R3": ({"ac": -1, "bc": 1, "cc": 1}, -1000, 1000, "G3"),
    # a[c] <=> 2 d[c]
    "R4": ({"ac": -1, "dc": 2}, -1000, 1000, "G4"),
    # b[c] + c[c] -> e[c]
    "R5": ({"bc": -1, "cc": -1, "ec": 1}, 0, 1000, "G5"),
    # 2 d[c] -> e[c]
    "R6": ({"dc": -2, "ec": 1}, 0, 1000, "G6"),
    # e[c] -> e[s] (transport, with GPR)
    "R7": ({"ec": -1, "es": 1}, 0, 1000, "G7"),
    # e[s] -> (exchange, no GPR)
    "R8": ({"es": -1}, 0, 1000, ""),
    # a[c] <=> f[c]
    "R9": ({"ac": -1, "fc": 1}, -1000, 1000, "G9"),
    # f[c] <=> e[c]
    "R10": ({"fc": -1, "ec": 1}, -1000, 1000, "G10"),
}

# Target reaction scores for R1..R10 (RAVEN getTstModelRxnScores()).
TEST_MODEL_SCORES = [-2, -2, -1, 7, 0.5, 0.5, -1, -2, -3, 3.5]


def make_test_model() -> cobra.Model:
    """Build the toy ``testModel`` (8 metabolites, 10 reactions).

    The model is assembled by ``_build`` from ``_TEST_METS`` and ``_TEST_RXNS``;
    the objective is set to ``R8``.
    """
    return _build(
        model_id="testModel",
        mets=_TEST_METS,
        reactions=_TEST_RXNS,
        objective="R8",
    )


# Expected results taken from RAVEN's tinitTests.
# T0001 - ftINIT without tasks, default '1+1':
TEST_MODEL_FTINIT_NO_TASKS = ["R1", "R4", "R6", "R8", "R9", "R10"]
# T0001 - same run with R7 and R10 declared spontaneous:
TEST_MODEL_FTINIT_SPONT_R7_R10 = ["R1", "R2", "R4", "R6", "R7", "R8"]
# T0002 - task "gen e[s] from a[s]": essential reactions (pre-merge ids),
# the same set expressed in merged ids, and the resulting ftINIT output:
TEST_MODEL_TASK_ESSENTIAL_PREMERGE = ["R2", "R7"]
TEST_MODEL_TASK_ESSENTIAL_MERGED = ["R1", "R7"]
TEST_MODEL_FTINIT_WITH_TASK = ["R1", "R2", "R4", "R6", "R7", "R8", "R9", "R10"]
# T0004 - mergeLinear(testModel) merges {R1,R2},{R3,R5},{R4,R6},{R7,R8},{R9,R10};
# the group id of each of R1..R10 follows, then per-group reversibility and
# lower bound:
TEST_MODEL_GROUP_IDS = [1, 1, 2, 3, 2, 3, 4, 4, 5, 5]
TEST_MODEL_MERGED_REV = [0, 0, 0, 0, 1]
TEST_MODEL_MERGED_LB = [0, 0, 0, 0, -1000]
# groupRxnScores with R1, R2 and R8 zeroed (toIgnore): one summed score per
# merged group.
TEST_MODEL_GROUPED_SCORES = [0, -0.5, 7.5, -1, 0.5]


# The task: generate e[s] from a[s] (RAVEN getTstModelTasks()).
def make_test_task():
    """Return the toy task "make e[s] from a[s]" (RAVEN getTstModelTasks()).

    The task takes ``a[s]`` as input with bounds ``0 .. inf`` and requires
    ``e[s]`` as output with both bounds set to 1.
    """
    # Imported inside the function, as in the original, so importing this
    # module does not itself import ``raven_python.tasks``.
    from raven_python.tasks import Task

    label = "Gen e[s] from a[s]"
    uptake = ("a[s]", 0.0, math.inf)  # (token, LBin, UBin)
    secretion = ("e[s]", 1.0, 1.0)  # (token, LBout, UBout)
    return Task(
        id=label,
        description=label,
        inputs=[uptake],
        outputs=[secretion],
    )


# --------------------------------------------------------------------------- #
# testModel4 — RAVEN getTstModel4(): partial linear merges + flips.
# --------------------------------------------------------------------------- #
# Metabolite table ``{id: (name, compartment)}``: each metabolite is named after
# its own one-letter id and all of them sit in compartment "s".
_TEST4_METS = {met_id: (met_id, "s") for met_id in "abdefgh"}

# Reaction table ``{id: (stoichiometry, lb, ub, gpr)}``; reaction ``Ri`` carries
# gene ``Gi``.
_TEST4_RXNS = {
    # a[s] <=>
    "R1": ({"a": -1}, -1000, 1000, "G1"),
    # a[s] -> b[s]
    "R2": ({"a": -1, "b": 1}, 0, 1000, "G2"),
    # a[s] <=> b[s]
    "R3": ({"a": -1, "b": 1}, -1000, 1000, "G3"),
    # b[s] ->
    "R4": ({"b": -1}, 0, 1000, "G4"),
    # 5 a[s] <=> 5 d[s]
    "R5": ({"a": -5, "d": 5}, -1000, 1000, "G5"),
    # e[s] <=> d[s]
    "R6": ({"e": -1, "d": 1}, -1000, 1000, "G6"),
    # f[s]+g[s] <=> e[s]
    "R7": ({"f": -1, "g": -1, "e": 1}, -1000, 1000, "G7"),
    # b[s] <=> f[s]
    "R8": ({"b": -1, "f": 1}, -1000, 1000, "G8"),
    # h[s] <=> g[s]
    "R9": ({"h": -1, "g": 1}, -1000, 1000, "G9"),
    # h[s] ->
    "R10": ({"h": -1}, 0, 1000, "G10"),
    # e[s] -> g[s]
    "R11": ({"e": -1, "g": 1}, 0, 1000, "G11"),
}

# Target reaction scores for R1..R11.
TEST_MODEL4_SCORES = [-1, -1, 2, -1, 0.5, -2, 1, 1.3, -0.5, -0.4, 8]
# T0004 mergeLinear(testModel4): merges {R5,R6},{R7,R8},{R9,R10}; rest unmerged.
# Group id of each of R1..R11; 0 is used for the reactions left unmerged.
TEST_MODEL4_GROUP_IDS = [0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 0]
# Eight entries: presumably one per reaction remaining after the merge
# (5 unmerged + 3 merged groups).
TEST_MODEL4_MERGED_REV = [1, 0, 1, 0, 1, 1, 0, 0]
# Reactions whose direction is flipped when made irreversible.
TEST_MODEL4_REVERSED_RXNS = ["R6", "R9"]


def make_test_model4() -> cobra.Model:
    """Build the toy ``testModel4`` from ``_TEST4_METS`` / ``_TEST4_RXNS``.

    The objective is set to ``R4``.
    """
    return _build(
        model_id="testModel4",
        mets=_TEST4_METS,
        reactions=_TEST4_RXNS,
        objective="R4",
    )


# --------------------------------------------------------------------------- #
# testModel5 — RAVEN getTstModel5(): testModel + an unmerged parallel path R11-R14.
# --------------------------------------------------------------------------- #
def make_test_model5() -> cobra.Model:
    """Build ``testModel5``: ``testModel`` plus metabolite ``gc`` and R11-R14.

    R11 and R12 both convert ``ac`` to ``gc``; R13 and R14 both convert ``gc``
    to ``ec``. All four are reversible (bounds -1000..1000), and reaction
    ``Ri`` carries gene ``Gi``.
    """
    model = make_test_model()
    model.id = "testModel5"
    model.add_metabolites([cobra.Metabolite("gc", name="g", compartment="c")])
    a_c, e_c, g_c = (model.metabolites.get_by_id(mid) for mid in ("ac", "ec", "gc"))

    # (reaction id, consumed metabolite, produced metabolite, GPR)
    parallel_steps = (
        ("R11", a_c, g_c, "G11"),
        ("R12", a_c, g_c, "G12"),
        ("R13", g_c, e_c, "G13"),
        ("R14", g_c, e_c, "G14"),
    )
    for rxn_id, consumed, produced, rule in parallel_steps:
        rxn = cobra.Reaction(rxn_id, lower_bound=-1000, upper_bound=1000)
        rxn.add_metabolites({consumed: -1, produced: 1})
        model.add_reactions([rxn])
        # Same order as the original: the GPR is assigned only after the
        # reaction has been added to the model.
        rxn.gene_reaction_rule = rule
    return model


# Scores for R1..R14: the testModel scores followed by those of R11..R14.
TEST_MODEL5_SCORES = TEST_MODEL_SCORES + [-1, -1.5, -1, -1.5]