From 0a9f4a1f20f1503b551273ffdbba2a25a00e9f72 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:45:40 +0200 Subject: [PATCH 1/7] Project scaffold: pyproject + package skeleton + README + LICENSE --- LICENSE | 541 +++++++++++++++++++++++++++++++++++ README.md | 93 +++++- pyproject.toml | 84 ++++++ src/raven_python/__init__.py | 10 + 4 files changed, 715 insertions(+), 13 deletions(-) create mode 100644 LICENSE create mode 100644 pyproject.toml create mode 100644 src/raven_python/__init__.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1f44cf3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,541 @@ +GNU GENERAL PUBLIC LICENSE +========================== + +Version 3, 29 June 2007 + +Copyright © 2007 Free Software Foundation, Inc. <> + +Everyone is permitted to copy and distribute verbatim copies of this license +document, but changing it is not allowed. + +## Preamble + +The GNU General Public License is a free, copyleft license for software and other +kinds of works. + +The licenses for most software and other practical works are designed to take away +your freedom to share and change the works. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change all versions of a +program--to make sure it remains free software for all its users. We, the Free +Software Foundation, use the GNU General Public License for most of our software; it +applies also to any other work released this way by its authors. You can apply it to +your programs, too. + +When we speak of free software, we are referring to freedom, not price. Our General +Public Licenses are designed to make sure that you have the freedom to distribute +copies of free software (and charge for them if you wish), that you receive source +code or can get it if you want it, that you can change the software or use pieces of +it in new free programs, and that you know you can do these things. + +To protect your rights, we need to prevent others from denying you these rights or +asking you to surrender the rights. Therefore, you have certain responsibilities if +you distribute copies of the software, or if you modify it: responsibilities to +respect the freedom of others. + +For example, if you distribute copies of such a program, whether gratis or for a fee, +you must pass on to the recipients the same freedoms that you received. You must make +sure that they, too, receive or can get the source code. And you must show them these +terms so they know their rights. + +Developers that use the GNU GPL protect your rights with two steps: (1) assert +copyright on the software, and (2) offer you this License giving you legal permission +to copy, distribute and/or modify it. + +For the developers' and authors' protection, the GPL clearly explains that there is +no warranty for this free software. For both users' and authors' sake, the GPL +requires that modified versions be marked as changed, so that their problems will not +be attributed erroneously to authors of previous versions. + +Some devices are designed to deny users access to install or run modified versions of +the software inside them, although the manufacturer can do so. This is fundamentally +incompatible with the aim of protecting users' freedom to change the software. The +systematic pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we have designed +this version of the GPL to prohibit the practice for those products. If such problems +arise substantially in other domains, we stand ready to extend this provision to +those domains in future versions of the GPL, as needed to protect the freedom of +users. + +Finally, every program is threatened constantly by software patents. States should +not allow patents to restrict development and use of software on general-purpose +computers, but in those that do, we wish to avoid the special danger that patents +applied to a free program could make it effectively proprietary. To prevent this, the +GPL assures that patents cannot be used to render the program non-free. + +The precise terms and conditions for copying, distribution and modification follow. + +## TERMS AND CONDITIONS + +### 0. Definitions. + +“This License” refers to version 3 of the GNU General Public License. + +“Copyright” also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + +“The Program” refers to any copyrightable work licensed under this +License. Each licensee is addressed as “you”. “Licensees” and +“recipients” may be individuals or organizations. + +To “modify” a work means to copy from or adapt all or part of the work in +a fashion requiring copyright permission, other than the making of an exact copy. The +resulting work is called a “modified version” of the earlier work or a +work “based on” the earlier work. + +A “covered work” means either the unmodified Program or a work based on +the Program. + +To “propagate” a work means to do anything with it that, without +permission, would make you directly or secondarily liable for infringement under +applicable copyright law, except executing it on a computer or modifying a private +copy. Propagation includes copying, distribution (with or without modification), +making available to the public, and in some countries other activities as well. + +To “convey” a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through a computer +network, with no transfer of a copy, is not conveying. + +An interactive user interface displays “Appropriate Legal Notices” to the +extent that it includes a convenient and prominently visible feature that (1) +displays an appropriate copyright notice, and (2) tells the user that there is no +warranty for the work (except to the extent that warranties are provided), that +licensees may convey the work under this License, and how to view a copy of this +License. If the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + +### 1. Source Code. + +The “source code” for a work means the preferred form of the work for +making modifications to it. “Object code” means any non-source form of a +work. + +A “Standard Interface” means an interface that either is an official +standard defined by a recognized standards body, or, in the case of interfaces +specified for a particular programming language, one that is widely used among +developers working in that language. + +The “System Libraries” of an executable work include anything, other than +the work as a whole, that (a) is included in the normal form of packaging a Major +Component, but which is not part of that Major Component, and (b) serves only to +enable use of the work with that Major Component, or to implement a Standard +Interface for which an implementation is available to the public in source code form. +A “Major Component”, in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system (if any) on which +the executable work runs, or a compiler used to produce the work, or an object code +interpreter used to run it. + +The “Corresponding Source” for a work in object code form means all the +source code needed to generate, install, and (for an executable work) run the object +code and to modify the work, including scripts to control those activities. However, +it does not include the work's System Libraries, or general-purpose tools or +generally available free programs which are used unmodified in performing those +activities but which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for the work, and +the source code for shared libraries and dynamically linked subprograms that the work +is specifically designed to require, such as by intimate data communication or +control flow between those subprograms and other parts of the work. + +The Corresponding Source need not include anything that users can regenerate +automatically from other parts of the Corresponding Source. + +The Corresponding Source for a work in source code form is that same work. + +### 2. Basic Permissions. + +All rights granted under this License are granted for the term of copyright on the +Program, and are irrevocable provided the stated conditions are met. This License +explicitly affirms your unlimited permission to run the unmodified Program. The +output from running a covered work is covered by this License only if the output, +given its content, constitutes a covered work. This License acknowledges your rights +of fair use or other equivalent, as provided by copyright law. + +You may make, run and propagate covered works that you do not convey, without +conditions so long as your license otherwise remains in force. You may convey covered +works to others for the sole purpose of having them make modifications exclusively +for you, or provide you with facilities for running those works, provided that you +comply with the terms of this License in conveying all material for which you do not +control copyright. Those thus making or running the covered works for you must do so +exclusively on your behalf, under your direction and control, on terms that prohibit +them from making any copies of your copyrighted material outside their relationship +with you. + +Conveying under any other circumstances is permitted solely under the conditions +stated below. Sublicensing is not allowed; section 10 makes it unnecessary. + +### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + +No covered work shall be deemed part of an effective technological measure under any +applicable law fulfilling obligations under article 11 of the WIPO copyright treaty +adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention +of such measures. + +When you convey a covered work, you waive any legal power to forbid circumvention of +technological measures to the extent such circumvention is effected by exercising +rights under this License with respect to the covered work, and you disclaim any +intention to limit operation or modification of the work as a means of enforcing, +against the work's users, your or third parties' legal rights to forbid circumvention +of technological measures. + +### 4. Conveying Verbatim Copies. + +You may convey verbatim copies of the Program's source code as you receive it, in any +medium, provided that you conspicuously and appropriately publish on each copy an +appropriate copyright notice; keep intact all notices stating that this License and +any non-permissive terms added in accord with section 7 apply to the code; keep +intact all notices of the absence of any warranty; and give all recipients a copy of +this License along with the Program. + +You may charge any price or no price for each copy that you convey, and you may offer +support or warranty protection for a fee. + +### 5. Conveying Modified Source Versions. + +You may convey a work based on the Program, or the modifications to produce it from +the Program, in the form of source code under the terms of section 4, provided that +you also meet all of these conditions: + +* **a)** The work must carry prominent notices stating that you modified it, and giving a +relevant date. +* **b)** The work must carry prominent notices stating that it is released under this +License and any conditions added under section 7. This requirement modifies the +requirement in section 4 to “keep intact all notices”. +* **c)** You must license the entire work, as a whole, under this License to anyone who +comes into possession of a copy. This License will therefore apply, along with any +applicable section 7 additional terms, to the whole of the work, and all its parts, +regardless of how they are packaged. This License gives no permission to license the +work in any other way, but it does not invalidate such permission if you have +separately received it. +* **d)** If the work has interactive user interfaces, each must display Appropriate Legal +Notices; however, if the Program has interactive interfaces that do not display +Appropriate Legal Notices, your work need not make them do so. + +A compilation of a covered work with other separate and independent works, which are +not by their nature extensions of the covered work, and which are not combined with +it such as to form a larger program, in or on a volume of a storage or distribution +medium, is called an “aggregate” if the compilation and its resulting +copyright are not used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work in an aggregate +does not cause this License to apply to the other parts of the aggregate. + +### 6. Conveying Non-Source Forms. + +You may convey a covered work in object code form under the terms of sections 4 and +5, provided that you also convey the machine-readable Corresponding Source under the +terms of this License, in one of these ways: + +* **a)** Convey the object code in, or embodied in, a physical product (including a +physical distribution medium), accompanied by the Corresponding Source fixed on a +durable physical medium customarily used for software interchange. +* **b)** Convey the object code in, or embodied in, a physical product (including a +physical distribution medium), accompanied by a written offer, valid for at least +three years and valid for as long as you offer spare parts or customer support for +that product model, to give anyone who possesses the object code either (1) a copy of +the Corresponding Source for all the software in the product that is covered by this +License, on a durable physical medium customarily used for software interchange, for +a price no more than your reasonable cost of physically performing this conveying of +source, or (2) access to copy the Corresponding Source from a network server at no +charge. +* **c)** Convey individual copies of the object code with a copy of the written offer to +provide the Corresponding Source. This alternative is allowed only occasionally and +noncommercially, and only if you received the object code with such an offer, in +accord with subsection 6b. +* **d)** Convey the object code by offering access from a designated place (gratis or for +a charge), and offer equivalent access to the Corresponding Source in the same way +through the same place at no further charge. You need not require recipients to copy +the Corresponding Source along with the object code. If the place to copy the object +code is a network server, the Corresponding Source may be on a different server +(operated by you or a third party) that supports equivalent copying facilities, +provided you maintain clear directions next to the object code saying where to find +the Corresponding Source. Regardless of what server hosts the Corresponding Source, +you remain obligated to ensure that it is available for as long as needed to satisfy +these requirements. +* **e)** Convey the object code using peer-to-peer transmission, provided you inform +other peers where the object code and Corresponding Source of the work are being +offered to the general public at no charge under subsection 6d. + +A separable portion of the object code, whose source code is excluded from the +Corresponding Source as a System Library, need not be included in conveying the +object code work. + +A “User Product” is either (1) a “consumer product”, which +means any tangible personal property which is normally used for personal, family, or +household purposes, or (2) anything designed or sold for incorporation into a +dwelling. In determining whether a product is a consumer product, doubtful cases +shall be resolved in favor of coverage. For a particular product received by a +particular user, “normally used” refers to a typical or common use of +that class of product, regardless of the status of the particular user or of the way +in which the particular user actually uses, or expects or is expected to use, the +product. A product is a consumer product regardless of whether the product has +substantial commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + +“Installation Information” for a User Product means any methods, +procedures, authorization keys, or other information required to install and execute +modified versions of a covered work in that User Product from a modified version of +its Corresponding Source. The information must suffice to ensure that the continued +functioning of the modified object code is in no case prevented or interfered with +solely because modification has been made. + +If you convey an object code work under this section in, or with, or specifically for +use in, a User Product, and the conveying occurs as part of a transaction in which +the right of possession and use of the User Product is transferred to the recipient +in perpetuity or for a fixed term (regardless of how the transaction is +characterized), the Corresponding Source conveyed under this section must be +accompanied by the Installation Information. But this requirement does not apply if +neither you nor any third party retains the ability to install modified object code +on the User Product (for example, the work has been installed in ROM). + +The requirement to provide Installation Information does not include a requirement to +continue to provide support service, warranty, or updates for a work that has been +modified or installed by the recipient, or for the User Product in which it has been +modified or installed. Access to a network may be denied when the modification itself +materially and adversely affects the operation of the network or violates the rules +and protocols for communication across the network. + +Corresponding Source conveyed, and Installation Information provided, in accord with +this section must be in a format that is publicly documented (and with an +implementation available to the public in source code form), and must require no +special password or key for unpacking, reading or copying. + +### 7. Additional Terms. + +“Additional permissions” are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. Additional +permissions that are applicable to the entire Program shall be treated as though they +were included in this License, to the extent that they are valid under applicable +law. If additional permissions apply only to part of the Program, that part may be +used separately under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + +When you convey a copy of a covered work, you may at your option remove any +additional permissions from that copy, or from any part of it. (Additional +permissions may be written to require their own removal in certain cases when you +modify the work.) You may place additional permissions on material, added by you to a +covered work, for which you have or can give appropriate copyright permission. + +Notwithstanding any other provision of this License, for material you add to a +covered work, you may (if authorized by the copyright holders of that material) +supplement the terms of this License with terms: + +* **a)** Disclaiming warranty or limiting liability differently from the terms of +sections 15 and 16 of this License; or +* **b)** Requiring preservation of specified reasonable legal notices or author +attributions in that material or in the Appropriate Legal Notices displayed by works +containing it; or +* **c)** Prohibiting misrepresentation of the origin of that material, or requiring that +modified versions of such material be marked in reasonable ways as different from the +original version; or +* **d)** Limiting the use for publicity purposes of names of licensors or authors of the +material; or +* **e)** Declining to grant rights under trademark law for use of some trade names, +trademarks, or service marks; or +* **f)** Requiring indemnification of licensors and authors of that material by anyone +who conveys the material (or modified versions of it) with contractual assumptions of +liability to the recipient, for any liability that these contractual assumptions +directly impose on those licensors and authors. + +All other non-permissive additional terms are considered “further +restrictions” within the meaning of section 10. If the Program as you received +it, or any part of it, contains a notice stating that it is governed by this License +along with a term that is a further restriction, you may remove that term. If a +license document contains a further restriction but permits relicensing or conveying +under this License, you may add to a covered work material governed by the terms of +that license document, provided that the further restriction does not survive such +relicensing or conveying. + +If you add terms to a covered work in accord with this section, you must place, in +the relevant source files, a statement of the additional terms that apply to those +files, or a notice indicating where to find the applicable terms. + +Additional terms, permissive or non-permissive, may be stated in the form of a +separately written license, or stated as exceptions; the above requirements apply +either way. + +### 8. Termination. + +You may not propagate or modify a covered work except as expressly provided under +this License. Any attempt otherwise to propagate or modify it is void, and will +automatically terminate your rights under this License (including any patent licenses +granted under the third paragraph of section 11). + +However, if you cease all violation of this License, then your license from a +particular copyright holder is reinstated (a) provisionally, unless and until the +copyright holder explicitly and finally terminates your license, and (b) permanently, +if the copyright holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + +Moreover, your license from a particular copyright holder is reinstated permanently +if the copyright holder notifies you of the violation by some reasonable means, this +is the first time you have received notice of violation of this License (for any +work) from that copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + +Termination of your rights under this section does not terminate the licenses of +parties who have received copies or rights from you under this License. If your +rights have been terminated and not permanently reinstated, you do not qualify to +receive new licenses for the same material under section 10. + +### 9. Acceptance Not Required for Having Copies. + +You are not required to accept this License in order to receive or run a copy of the +Program. Ancillary propagation of a covered work occurring solely as a consequence of +using peer-to-peer transmission to receive a copy likewise does not require +acceptance. However, nothing other than this License grants you permission to +propagate or modify any covered work. These actions infringe copyright if you do not +accept this License. Therefore, by modifying or propagating a covered work, you +indicate your acceptance of this License to do so. + +### 10. Automatic Licensing of Downstream Recipients. + +Each time you convey a covered work, the recipient automatically receives a license +from the original licensors, to run, modify and propagate that work, subject to this +License. You are not responsible for enforcing compliance by third parties with this +License. + +An “entity transaction” is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an organization, or +merging organizations. If propagation of a covered work results from an entity +transaction, each party to that transaction who receives a copy of the work also +receives whatever licenses to the work the party's predecessor in interest had or +could give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if the predecessor +has it or can get it with reasonable efforts. + +You may not impose any further restrictions on the exercise of the rights granted or +affirmed under this License. For example, you may not impose a license fee, royalty, +or other charge for exercise of rights granted under this License, and you may not +initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging +that any patent claim is infringed by making, using, selling, offering for sale, or +importing the Program or any portion of it. + +### 11. Patents. + +A “contributor” is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The work thus +licensed is called the contributor's “contributor version”. + +A contributor's “essential patent claims” are all patent claims owned or +controlled by the contributor, whether already acquired or hereafter acquired, that +would be infringed by some manner, permitted by this License, of making, using, or +selling its contributor version, but do not include claims that would be infringed +only as a consequence of further modification of the contributor version. For +purposes of this definition, “control” includes the right to grant patent +sublicenses in a manner consistent with the requirements of this License. + +Each contributor grants you a non-exclusive, worldwide, royalty-free patent license +under the contributor's essential patent claims, to make, use, sell, offer for sale, +import and otherwise run, modify and propagate the contents of its contributor +version. + +In the following three paragraphs, a “patent license” is any express +agreement or commitment, however denominated, not to enforce a patent (such as an +express permission to practice a patent or covenant not to sue for patent +infringement). To “grant” such a patent license to a party means to make +such an agreement or commitment not to enforce a patent against the party. + +If you convey a covered work, knowingly relying on a patent license, and the +Corresponding Source of the work is not available for anyone to copy, free of charge +and under the terms of this License, through a publicly available network server or +other readily accessible means, then you must either (1) cause the Corresponding +Source to be so available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner consistent with +the requirements of this License, to extend the patent license to downstream +recipients. “Knowingly relying” means you have actual knowledge that, but +for the patent license, your conveying the covered work in a country, or your +recipient's use of the covered work in a country, would infringe one or more +identifiable patents in that country that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or arrangement, you +convey, or propagate by procuring conveyance of, a covered work, and grant a patent +license to some of the parties receiving the covered work authorizing them to use, +propagate, modify or convey a specific copy of the covered work, then the patent +license you grant is automatically extended to all recipients of the covered work and +works based on it. + +A patent license is “discriminatory” if it does not include within the +scope of its coverage, prohibits the exercise of, or is conditioned on the +non-exercise of one or more of the rights that are specifically granted under this +License. You may not convey a covered work if you are a party to an arrangement with +a third party that is in the business of distributing software, under which you make +payment to the third party based on the extent of your activity of conveying the +work, and under which the third party grants, to any of the parties who would receive +the covered work from you, a discriminatory patent license (a) in connection with +copies of the covered work conveyed by you (or copies made from those copies), or (b) +primarily for and in connection with specific products or compilations that contain +the covered work, unless you entered into that arrangement, or that patent license +was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting any implied +license or other defenses to infringement that may otherwise be available to you +under applicable patent law. + +### 12. No Surrender of Others' Freedom. + +If conditions are imposed on you (whether by court order, agreement or otherwise) +that contradict the conditions of this License, they do not excuse you from the +conditions of this License. If you cannot convey a covered work so as to satisfy +simultaneously your obligations under this License and any other pertinent +obligations, then as a consequence you may not convey it at all. For example, if you +agree to terms that obligate you to collect a royalty for further conveying from +those to whom you convey the Program, the only way you could satisfy both those terms +and this License would be to refrain entirely from conveying the Program. + +### 13. Use with the GNU Affero General Public License. + +Notwithstanding any other provision of this License, you have permission to link or +combine any covered work with a work licensed under version 3 of the GNU Affero +General Public License into a single combined work, and to convey the resulting work. +The terms of this License will continue to apply to the part which is the covered +work, but the special requirements of the GNU Affero General Public License, section +13, concerning interaction through a network will apply to the combination as such. + +### 14. Revised Versions of this License. + +The Free Software Foundation may publish revised and/or new versions of the GNU +General Public License from time to time. Such new versions will be similar in spirit +to the present version, but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Program specifies that +a certain numbered version of the GNU General Public License “or any later +version” applies to it, you have the option of following the terms and +conditions either of that numbered version or of any later version published by the +Free Software Foundation. If the Program does not specify a version number of the GNU +General Public License, you may choose any version ever published by the Free +Software Foundation. + +If the Program specifies that a proxy can decide which future versions of the GNU +General Public License can be used, that proxy's public statement of acceptance of a +version permanently authorizes you to choose that version for the Program. + +Later license versions may give you additional or different permissions. However, no +additional obligations are imposed on any author or copyright holder as a result of +your choosing to follow a later version. + +### 15. Disclaimer of Warranty. + +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER +EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE +QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +### 16. Limitation of Liability. + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY +COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS +PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, +INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE +OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE +WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +### 17. Interpretation of Sections 15 and 16. + +If the disclaimer of warranty and limitation of liability provided above cannot be +given local legal effect according to their terms, reviewing courts shall apply local +law that most closely approximates an absolute waiver of all civil liability in +connection with the Program, unless a warranty or assumption of liability accompanies +a copy of the Program in return for a fee. \ No newline at end of file diff --git a/README.md b/README.md index c8f3d64..69111be 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,82 @@ # raven-python -The Python counterpart of the -[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB), built on -[cobrapy](https://github.com/opencobra/cobrapy). - -`raven-python` covers de-novo reconstruction (KEGG + protein homology), -context-specific model extraction (`tINIT` / `ftINIT`), metabolic-task -validation, gap-filling, omics ingestion, sub-cellular localisation, model -manipulation, and YAML / SIF / Excel I/O — preserving the established RAVEN -workflows in a Python-native form. - -This `main` branch is intentionally empty. Development happens on the -`develop` branch via a series of feature branches; see the open and merged -pull requests for the current state of the port. +[![CI](https://github.com/SysBioChalmers/raven-python/actions/workflows/ci.yml/badge.svg)](https://github.com/SysBioChalmers/raven-python/actions/workflows/ci.yml) + +**Reconstruction, Analysis and Visualisation of Metabolic Networks — in Python.** + +`raven-python` is the Python counterpart of the +[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB). It builds on +[**cobrapy**](https://github.com/opencobra/cobrapy) for everything cobrapy already does +well (simulation, standard analyses, SBML I/O, model manipulation) and adds the +functionality that's unique to RAVEN: + +* **De novo reconstruction** from KEGG and protein homology (BLAST / DIAMOND). +* **Context-specific models** from omics data via **tINIT / ftINIT**, with task-aware + gap-filling and the linear-merge MILP reduction. +* **Metabolic-task** validation (`check_tasks`, `fitTasks`). +* **Connectivity gap-filling** against template models. +* **Omics integration** — Human Protein Atlas (proteomics + RNA-seq) ingestion. +* **Sub-cellular localisation** prediction by MILP, with partial-update mode and + pluggable predictors (WoLF PSORT, DeepLoc, …). +* **N-model comparison**; **reporter metabolites**; **FSEOF**; **flux sampling**. +* **YAML I/O** following the cobra standard, plus geckopy's `ec-*` enzyme-constrained + fields. **SIF** export. **RAVEN-style Excel** export. + +The status of every RAVEN function (ported, cheatsheet-mapped to cobra, or explicitly +not ported) is documented function-by-function in +**[docs/raven_migration.md](docs/raven_migration.md)**. + +## Design principle + +The canonical in-memory object is always a [`cobra.Model`](https://cobrapy.readthedocs.io). +There is no parallel RAVEN struct, no `ravenCobraWrapper`-style adapter. RAVEN-specific +fields that cobra doesn't model natively (`rxnMiriams`, `metDeltaG`, +`rxnConfidenceScores`, …) live in cobra's `annotation` / `notes` dictionaries. This +avoids duplicating cobra's data model and keeps raven-python interoperable with the wider +COBRA ecosystem. + +## Status + +raven-python has been validated against MATLAB RAVEN on **Human-GEM** (5 Hart2015 cell-line +models, Jaccard 0.975–0.980 — see [docs/humangem_validation.md](docs/humangem_validation.md)). +The functional scope of the original RAVEN toolbox is covered with two principled +omissions: + +* **MetaCyc-based reconstruction** is not implemented and is flagged for removal from + MATLAB RAVEN as well — see [IMPROVEMENTS.md](IMPROVEMENTS.md) under `R-MetaCyc`. +* **Dynamic FBA** is not implemented — several maintained Python packages already cover + it ([`dfba`](https://pypi.org/project/dfba/), [`reframed`](https://pypi.org/project/reframed/), + [`mewpy`](https://pypi.org/project/mewpy/)). + +What's still open is catalogued in **[docs/todo.md](docs/todo.md)** (visualisation / Phase +6 is the main item). + +## Installation (development) + +```bash +git clone https://github.com/SysBioChalmers/raven-python +cd raven-python +pip install -e ".[dev]" +``` + +raven-python requires Python ≥ 3.11. Genome-scale (f)tINIT MILPs currently require **Gurobi** +([details on solver portability](docs/init_solver_benchmark.md)); toy and unit-test work +runs on the open-source GLPK. + +## Documentation + +See **[docs/README.md](docs/README.md)** for the documentation index. + +## Relationship to MATLAB RAVEN + +`raven-python` is a derivative work and is released under the same **GPL-3.0-or-later** +license. If you use it in scientific work, please cite the RAVEN 2 paper: + +> Wang H, Marcišauskas S, Sánchez BJ, Domenzain I, Hermansson D, Agren R, Nielsen J, +> Kerkhoven EJ. (2018) RAVEN 2.0: A versatile toolbox for metabolic network +> reconstruction and a case study on *Streptomyces coelicolor*. PLoS Comput Biol 14(10): +> e1006541. + +## License + +[GPL-3.0-or-later](LICENSE) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..faeeb1f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,84 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "raven-python" +version = "0.0.1" +description = "Reconstruction, Analysis and Visualization of Metabolic Networks in Python, a port of the RAVEN Toolbox built on cobrapy" +readme = "README.md" +requires-python = ">=3.11" +license = { text = "GPL-3.0-or-later" } +authors = [ + { name = "Eduard Kerkhoven", email = "eduardk@chalmers.se" }, +] +keywords = [ + "genome-scale-model", + "metabolic-model", + "reconstruction", + "raven", + "cobra", + "systems-biology", + "constraint-based-modeling", + "kegg", + "metacyc", + "tinit", +] +classifiers = [ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "cobra>=0.29", + "numpy>=1.21", + "pandas>=2", + "scipy>=1.10", + "ruamel.yaml>=0.17", + "requests>=2.28", + "tqdm>=4.65", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7", + "pytest-cov", + "ruff>=0.4", +] +excel = [ + "openpyxl>=3.1", +] +plotting = [ + "matplotlib>=3.5", +] + +[project.urls] +Homepage = "https://github.com/SysBioChalmers/raven-python" +Source = "https://github.com/SysBioChalmers/raven-python" +Issues = "https://github.com/SysBioChalmers/raven-python/issues" +"RAVEN MATLAB" = "https://github.com/SysBioChalmers/RAVEN" + +[tool.hatch.build.targets.wheel] +packages = ["src/raven_python"] + +[tool.pytest.ini_options] +testpaths = ["tests"] + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "W", "I", "UP", "B"] +ignore = [ + "E501", # line length handled by the formatter +] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["E402"] diff --git a/src/raven_python/__init__.py b/src/raven_python/__init__.py new file mode 100644 index 0000000..591c4c1 --- /dev/null +++ b/src/raven_python/__init__.py @@ -0,0 +1,10 @@ +"""raven_python — Python counterpart of the RAVEN Toolbox, built on cobrapy. + +raven_python reuses cobrapy for simulation, standard analyses, SBML I/O, and model +manipulation, and provides the RAVEN-specific functionality on top: de novo +reconstruction (KEGG / homology), context-specific modeling (tINIT / ftINIT), +metabolic task validation, connectivity gap-filling, omics integration (HPA), +sub-cellular localisation, N-model comparison, and the RAVEN-style I/O formats. +""" + +__version__ = "0.0.1" From b7b69ac4f5cc00831df993fc718dd960ab82e0df Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:45:40 +0200 Subject: [PATCH 2/7] Add GitHub Actions CI and the maintainer-scripts README --- .github/workflows/ci.yml | 48 ++++++++++++++++++++++++++++++++++++++++ scripts/README.md | 35 +++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 scripts/README.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3f6d19b --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,48 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + workflow_dispatch: + +# A push that obsoletes a previous run cancels it. +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + name: ruff + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + cache-dependency-path: pyproject.toml + - run: pip install --upgrade pip + - run: pip install ruff + - run: ruff check . + + test: + name: pytest (py${{ matrix.python }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python: ["3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: pyproject.toml + - run: pip install --upgrade pip + # ``-e .[dev,plotting,excel]`` so every optional extra is exercised. + # Gurobi is not installable on free runners; the relevant tests + # skip themselves when ``optlang.gurobi_interface`` cannot import. + - run: pip install -e ".[dev,plotting,excel]" + - run: pytest -q --maxfail=5 --durations=20 diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..67a3b36 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,35 @@ +# Maintainer scripts + +Release-time tooling. Not part of the installed package — run them from a checkout +with raven-python installed (`pip install -e .`). End users never need these. + +## `build_kegg_artefacts.py` + +Build the publishable KEGG artefact set from an arranged KEGG dump (see +`download_kegg_dump`): the gzipped-YAML reference model, the gzipped-TSV tables, +and (with `--hmms`) the per-domain pressed HMM libraries. Output is laid out ready +to upload as release assets. See [docs/maintaining_kegg_data.md](../docs/maintaining_kegg_data.md). + +```bash +python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts # tables + model +python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8 +``` + +## `make_registry_snippet.py` + +After uploading the files to a release, compute their SHA256 and print the entry +to merge into the runtime registry — `raven_python.data._DATA_REGISTRY` (data) or +`raven_python.binaries._REGISTRY` (binary ZIP bundles). The checksum helper is shared +with the resolvers, so published checksums always match what `ensure_data` / +`ensure_binary` verify. + +```bash +# Data artefacts: +python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \ + --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116 + +# Binary bundle (ZIPs named ---.zip): +python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \ + --provides blastp makeblastdb --dir zips \ + --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0 +``` From 50bea4006f174a3549e670fd3ea622f8e31083cf Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:46:42 +0200 Subject: [PATCH 3/7] Add the foundation utilities: GPR, balance, parse, sort, validate --- src/raven_python/utils/__init__.py | 16 ++++ src/raven_python/utils/balance.py | 89 +++++++++++++++++++++ src/raven_python/utils/gpr.py | 119 +++++++++++++++++++++++++++++ src/raven_python/utils/parse.py | 33 ++++++++ src/raven_python/utils/sort.py | 21 +++++ src/raven_python/utils/validate.py | 86 +++++++++++++++++++++ tests/test_utils_balance.py | 76 ++++++++++++++++++ tests/test_utils_gpr.py | 84 ++++++++++++++++++++ tests/test_utils_sort.py | 42 ++++++++++ tests/test_utils_validate.py | 80 +++++++++++++++++++ 10 files changed, 646 insertions(+) create mode 100644 src/raven_python/utils/__init__.py create mode 100644 src/raven_python/utils/balance.py create mode 100644 src/raven_python/utils/gpr.py create mode 100644 src/raven_python/utils/parse.py create mode 100644 src/raven_python/utils/sort.py create mode 100644 src/raven_python/utils/validate.py create mode 100644 tests/test_utils_balance.py create mode 100644 tests/test_utils_gpr.py create mode 100644 tests/test_utils_sort.py create mode 100644 tests/test_utils_validate.py diff --git a/src/raven_python/utils/__init__.py b/src/raven_python/utils/__init__.py new file mode 100644 index 0000000..7127bdd --- /dev/null +++ b/src/raven_python/utils/__init__.py @@ -0,0 +1,16 @@ +"""Shared helpers — GPR linting, elemental balance, model curation checks, id sorting.""" +from raven_python.utils.balance import ElementalBalance, get_elemental_balance +from raven_python.utils.gpr import GPRIssue, find_non_dnf_grrules, is_dnf +from raven_python.utils.sort import sort_identifiers +from raven_python.utils.validate import ModelIssue, check_model + +__all__ = [ + "ElementalBalance", + "GPRIssue", + "ModelIssue", + "check_model", + "find_non_dnf_grrules", + "get_elemental_balance", + "is_dnf", + "sort_identifiers", +] diff --git a/src/raven_python/utils/balance.py b/src/raven_python/utils/balance.py new file mode 100644 index 0000000..ee64ab4 --- /dev/null +++ b/src/raven_python/utils/balance.py @@ -0,0 +1,89 @@ +"""Check the elemental balance of reactions, distinguishing *unbalanced* from +*unknown* (missing formula). + +cobra's ``reaction.check_mass_balance()`` silently treats a missing formula as +empty, so a reaction can look "unbalanced" — or even balanced — when the truth is +that the data is incomplete. This module checks for missing formulas first and +returns a graded status +per reaction (``balanced`` / ``unbalanced`` / ``unknown``) plus the element +imbalance — over a batch, as structured data. +""" +from __future__ import annotations + +from dataclasses import dataclass, field + +import cobra + + +@dataclass(frozen=True) +class ElementalBalance: + """Balance result for one reaction. + + Attributes + ---------- + reaction_id + ID of the reaction. + status + ``"balanced"`` — elements balance; + ``"unbalanced"`` — they do not (see ``imbalance``); + ``"unknown"`` — at least one metabolite has no formula, so it cannot be + determined (cobra would silently miscount these). + imbalance + Element → net coefficient (products − reactants), only for + ``"unbalanced"``; empty otherwise. Charge is not included. + """ + + reaction_id: str + status: str + imbalance: dict[str, float] = field(default_factory=dict) + + +def get_elemental_balance( + model: cobra.Model, reactions=None +) -> list[ElementalBalance]: + """Check whether reactions are elementally balanced. + Parameters + ---------- + reactions + Reaction IDs/objects to check; default all reactions. (Boundary + reactions exchange mass with the environment and will read as + ``unbalanced`` — filter them out if that is not wanted.) + + Returns + ------- + list of ElementalBalance + One entry per checked reaction, in model order. + """ + if reactions is None: + rxns = list(model.reactions) + else: + if isinstance(reactions, (str, cobra.Reaction)): + reactions = [reactions] + rxns = [ + r if isinstance(r, cobra.Reaction) else model.reactions.get_by_id(r) + for r in reactions + ] + + results: list[ElementalBalance] = [] + for rxn in rxns: + if not rxn.metabolites: + # A reaction with no metabolites used to fall through to ``balanced`` + # (vacuously) because ``any()`` over the empty list is False and the + # zero-element imbalance dict is empty. Treat the no-formula case + # (zero formulae present) as ``unknown``: we can't determine balance + # for a reaction without stoichiometry. + results.append(ElementalBalance(rxn.id, "unknown")) + continue + if any(not met.formula for met in rxn.metabolites): + results.append(ElementalBalance(rxn.id, "unknown")) + continue + imbalance = { + element: amount + for element, amount in rxn.check_mass_balance().items() + if element != "charge" + } + if imbalance: + results.append(ElementalBalance(rxn.id, "unbalanced", imbalance)) + else: + results.append(ElementalBalance(rxn.id, "balanced")) + return results diff --git a/src/raven_python/utils/gpr.py b/src/raven_python/utils/gpr.py new file mode 100644 index 0000000..2e2122d --- /dev/null +++ b/src/raven_python/utils/gpr.py @@ -0,0 +1,119 @@ +"""GPR (gene-protein-reaction rule) linting. + +Flag GPRs that are *not* in disjunctive normal form ("OR of AND-complexes"), via cobra's +GPR AST. GPR syntax *normalisation* is already done by cobra on assignment, so it isn't +re-implemented here. + +Part (2) has no cobrapy equivalent and is ported here, reworked onto cobra's +GPR AST instead of RAVEN's brittle substring search. The relevant property is +**disjunctive normal form (DNF)**: an OR of AND-clauses of single genes, e.g. +``(G1 and G2) or G3``. Rules where an AND contains an OR — e.g. +``(G1 or G2) and (G3 or G4)`` — are *valid* for cobra but ambiguous for the +isoenzyme/complex reasoning used across RAVEN/GECKO, and ``expand_model`` +(see :mod:`raven_python.manipulation.expand`) only does something for DNF rules. +:func:`find_non_dnf_grrules` surfaces them as structured data rather than, as +RAVEN did, only printing a warning. +""" +from __future__ import annotations + +import ast +from dataclasses import dataclass + +import cobra +from cobra.core.gene import GPR + + +def _contains_or(node: ast.AST | None) -> bool: + """True if ``node``'s subtree contains an OR operator anywhere.""" + if isinstance(node, ast.BoolOp): + if isinstance(node.op, ast.Or): + return True + return any(_contains_or(value) for value in node.values) + return False + + +def _is_dnf_node(node: ast.AST | None) -> bool: + """True if the AST rooted at ``node`` is in disjunctive normal form. + + DNF here means no AND operator has an OR anywhere beneath it, i.e. the + rule is a single gene, a pure AND-complex, or an OR of those. + """ + if node is None or isinstance(node, ast.Name): + return True + if isinstance(node, ast.BoolOp): + if isinstance(node.op, ast.And): + return not any(_contains_or(value) for value in node.values) + # OR: every disjunct must itself be DNF + return all(_is_dnf_node(value) for value in node.values) + # Unknown node type: don't flag it as a problem. + return True + + +def is_dnf(gpr: GPR | str | None) -> bool: + """Return whether a GPR is in disjunctive normal form (OR of AND-complexes). + + Parameters + ---------- + gpr + A cobra :class:`~cobra.core.gene.GPR`, a grRule string, or ``None``. + An empty/``None`` rule is trivially DNF. + + Examples + -------- + >>> is_dnf("(G1 and G2) or G3") + True + >>> is_dnf("(G1 or G2) and G3") + False + """ + if isinstance(gpr, str): + gpr = GPR.from_string(gpr) + if gpr is None: + return True + return _is_dnf_node(gpr.body) + + +@dataclass(frozen=True) +class GPRIssue: + """A reaction whose GPR is flagged by the linter. + + Attributes + ---------- + reaction_id + ID of the reaction. + gpr + The (already cobra-normalised) grRule string. + reason + Human-readable explanation of why it was flagged. + """ + + reaction_id: str + gpr: str + reason: str + + +_NON_DNF_REASON = ( + "GPR is not in disjunctive normal form (an AND clause contains an OR). " + "Isoenzyme/complex reasoning and expand_model assume an OR of AND-complexes, " + 'e.g. rewrite "(G1 or G2) and (G3 or G4)" as ' + '"(G1 and G3) or (G1 and G4) or (G2 and G3) or (G2 and G4)".' +) + + +def find_non_dnf_grrules(model: cobra.Model) -> list[GPRIssue]: + """Find reactions whose GPR is not in disjunctive normal form ("OR of AND-complexes"). + + Uses cobra's GPR AST. Reactions with no GPR are skipped. + + Returns + ------- + list of GPRIssue + One entry per flagged reaction, in model reaction order. Empty if all + GPRs are simple OR-of-AND-complexes. + """ + issues: list[GPRIssue] = [] + for rxn in model.reactions: + if not rxn.gene_reaction_rule: + continue + if not is_dnf(rxn.gpr): + issues.append(GPRIssue(rxn.id, rxn.gene_reaction_rule, _NON_DNF_REASON)) + return issues diff --git a/src/raven_python/utils/parse.py b/src/raven_python/utils/parse.py new file mode 100644 index 0000000..8068f6c --- /dev/null +++ b/src/raven_python/utils/parse.py @@ -0,0 +1,33 @@ +"""Small parsing helpers shared across raven_python.""" +from __future__ import annotations + +import re + +# A metabolite written as ``name[comp]``. The name is greedy so that, for a +# pathological name that itself contains brackets, the *last* ``[...]`` is taken +# as the compartment (matching RAVEN getIndexes' ``max(strfind('['))`` rule). +_NAME_COMP_RE = re.compile(r"^(?P.+)\[(?P[^\[\]]+)\]$") + + +def parse_name_comp(token: str) -> tuple[str, str | None]: + """Split a ``name[comp]`` token into ``(name, compartment)``. + + This is the one genuinely cobra-absent sliver of RAVEN ``getIndexes``' + ``metcomps`` mode and ``addRxns`` eqnType 3: resolving a metabolite written + as its *name* plus a compartment in square brackets, e.g. ``"ATP[c]"``. + + Returns ``(name, None)`` when there is no trailing ``[...]``. + + Examples + -------- + >>> parse_name_comp("ATP[c]") + ('ATP', 'c') + >>> parse_name_comp("ATP") + ('ATP', None) + >>> parse_name_comp("weird[name][m]") + ('weird[name]', 'm') + """ + match = _NAME_COMP_RE.match(token.strip()) + if match: + return match.group("name").strip(), match.group("comp").strip() + return token.strip(), None diff --git a/src/raven_python/utils/sort.py b/src/raven_python/utils/sort.py new file mode 100644 index 0000000..a8641a8 --- /dev/null +++ b/src/raven_python/utils/sort.py @@ -0,0 +1,21 @@ +"""Sort a model's identifiers alphabetically — useful for deterministic, +diff-friendly output. + +cobra's ``DictList.sort`` reorders one list (and rebuilds its lookup index), but +there is no single "sort the whole model" call; this provides it. +""" +from __future__ import annotations + +import cobra + + +def sort_identifiers(model: cobra.Model) -> cobra.Model: + """Sort reactions, metabolites and genes alphabetically by ID, in place. + + Returns the same (mutated) model for convenience. Compartments are a plain + dict and are emitted sorted by writers as needed. + """ + model.reactions.sort(key=lambda r: r.id) + model.metabolites.sort(key=lambda m: m.id) + model.genes.sort(key=lambda g: g.id) + return model diff --git a/src/raven_python/utils/validate.py b/src/raven_python/utils/validate.py new file mode 100644 index 0000000..c08df48 --- /dev/null +++ b/src/raven_python/utils/validate.py @@ -0,0 +1,86 @@ +"""Curation checks for a model. + +A QC bundle cobra has no single call for: orphaned objects, empty reactions, +duplicated metabolite ``name + compartment``, empty names, and objective sanity. +:func:`check_model` returns these as structured :class:`ModelIssue` records. +""" +from __future__ import annotations + +from dataclasses import dataclass + +import cobra + + +@dataclass(frozen=True) +class ModelIssue: + """One curation issue found in a model. + + Attributes + ---------- + category + Machine-readable kind, e.g. ``"orphan_metabolite"``, ``"empty_reaction"``, + ``"orphan_gene"``, ``"duplicate_name_compartment"``, + ``"empty_metabolite_name"``, ``"objective"``. + object_id + ID of the offending object, or ``None`` for model-level issues. + message + Human-readable description. + """ + + category: str + object_id: str | None + message: str + + +def check_model(model: cobra.Model) -> list[ModelIssue]: + """Run curation checks on a model and return the issues found. + + Does not + raise; returns a (possibly empty) list of :class:`ModelIssue`. + """ + issues: list[ModelIssue] = [] + + for met in model.metabolites: + if not met.reactions: + issues.append( + ModelIssue("orphan_metabolite", met.id, f"Metabolite {met.id!r} is not used in any reaction.") + ) + if not (met.name and str(met.name).strip()): + issues.append( + ModelIssue("empty_metabolite_name", met.id, f"Metabolite {met.id!r} has no name.") + ) + + for gene in model.genes: + if not gene.reactions: + issues.append( + ModelIssue("orphan_gene", gene.id, f"Gene {gene.id!r} is not associated with any reaction.") + ) + + for rxn in model.reactions: + if not rxn.metabolites: + issues.append( + ModelIssue("empty_reaction", rxn.id, f"Reaction {rxn.id!r} has no metabolites.") + ) + + by_name_comp: dict[tuple[str, str], list[str]] = {} + for met in model.metabolites: + by_name_comp.setdefault((met.name, met.compartment), []).append(met.id) + for (name, comp), ids in by_name_comp.items(): + if name and len(ids) > 1: + issues.append( + ModelIssue( + "duplicate_name_compartment", + None, + f"{len(ids)} metabolites share name {name!r} in compartment {comp!r}: {sorted(ids)}", + ) + ) + + objective_rxns = [r.id for r in model.reactions if r.objective_coefficient != 0] + if not objective_rxns: + issues.append(ModelIssue("objective", None, "No reaction has a nonzero objective coefficient.")) + elif len(objective_rxns) > 1: + issues.append( + ModelIssue("objective", None, f"Multiple objective reactions: {sorted(objective_rxns)}") + ) + + return issues diff --git a/tests/test_utils_balance.py b/tests/test_utils_balance.py new file mode 100644 index 0000000..aa1e47e --- /dev/null +++ b/tests/test_utils_balance.py @@ -0,0 +1,76 @@ +"""Tests for get_elemental_balance (getElementalBalance port).""" +import cobra +import pytest + +from raven_python.utils import ElementalBalance, get_elemental_balance + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", formula="C6H12O6", charge=0, compartment="c"), + cobra.Metabolite("b_c", formula="C6H12O6", charge=0, compartment="c"), + cobra.Metabolite("c_c", formula="C3H6O3", charge=0, compartment="c"), + cobra.Metabolite("n_c", compartment="c"), # no formula + ] + ) + r_bal = cobra.Reaction("R_bal") + m.add_reactions([r_bal]) + r_bal.build_reaction_from_string("a_c --> b_c") # C6H12O6 -> C6H12O6 + r_unbal = cobra.Reaction("R_unbal") + m.add_reactions([r_unbal]) + r_unbal.build_reaction_from_string("a_c --> c_c") # C6H12O6 -> C3H6O3 + r_unknown = cobra.Reaction("R_unknown") + m.add_reactions([r_unknown]) + r_unknown.build_reaction_from_string("a_c --> n_c") # n_c has no formula + return m + + +def test_balanced(model): + (res,) = get_elemental_balance(model, ["R_bal"]) + assert res == ElementalBalance("R_bal", "balanced", {}) + + +def test_unbalanced_reports_imbalance(model): + (res,) = get_elemental_balance(model, ["R_unbal"]) + assert res.status == "unbalanced" + # products - reactants: C3H6O3 - C6H12O6 = -C3H6O3 + assert res.imbalance == {"C": -3.0, "H": -6.0, "O": -3.0} + + +def test_missing_formula_is_unknown_not_silently_wrong(model): + # cobra's check_mass_balance alone would silently report an imbalance here; + # we flag it as unknown instead. + (res,) = get_elemental_balance(model, ["R_unknown"]) + assert res.status == "unknown" + assert res.imbalance == {} + + +def test_all_reactions_default(model): + results = get_elemental_balance(model) + assert {r.reaction_id: r.status for r in results} == { + "R_bal": "balanced", + "R_unbal": "unbalanced", + "R_unknown": "unknown", + } + + +def test_charge_excluded(model): + # give a charge imbalance but keep elements balanced -> still "balanced" + model.metabolites.get_by_id("b_c").charge = 1 + (res,) = get_elemental_balance(model, ["R_bal"]) + assert res.status == "balanced" + + +# --- regression: empty reaction → unknown (known_issues.md F5) ------------- + +def test_empty_reaction_is_unknown(model): + """A reaction with no metabolites used to be reported `balanced` + vacuously (any() over an empty list is False and check_mass_balance + returns no imbalance). Now reports `unknown`.""" + empty = cobra.Reaction("R_empty", lower_bound=0, upper_bound=1000) + model.add_reactions([empty]) + (res,) = get_elemental_balance(model, ["R_empty"]) + assert res.status == "unknown" diff --git a/tests/test_utils_gpr.py b/tests/test_utils_gpr.py new file mode 100644 index 0000000..275d020 --- /dev/null +++ b/tests/test_utils_gpr.py @@ -0,0 +1,84 @@ +"""Tests for raven_python.utils.gpr (GPR linting).""" +import cobra +import pytest + +from raven_python.utils import GPRIssue, find_non_dnf_grrules, is_dnf + + +@pytest.mark.parametrize( + "rule", + [ + "", + "G1", + "G1 and G2", + "G1 or G2", + "G1 and G2 and G3", + "G1 or G2 or G3", + "(G1 and G2) or G3", + "(G1 and G2) or (G3 and G4)", + "G1 or (G2 and G3)", + ], +) +def test_is_dnf_true(rule): + assert is_dnf(rule) is True + + +@pytest.mark.parametrize( + "rule", + [ + "(G1 or G2) and G3", + "G1 and (G2 or G3)", + "(G1 or G2) and (G3 or G4)", + "G1 and (G2 or (G3 and G4))", + ], +) +def test_is_dnf_false(rule): + assert is_dnf(rule) is False + + +def test_is_dnf_accepts_gpr_and_none(): + from cobra.core.gene import GPR + + assert is_dnf(GPR.from_string("(G1 or G2) and G3")) is False + assert is_dnf(GPR.from_string("G1 or G2")) is True + assert is_dnf(None) is True + + +def test_is_dnf_independent_of_formatting(): + # cobra normalises on assignment, so casing/whitespace cannot change the verdict. + assert is_dnf("(G1 OR G2) AND G3") is False + assert is_dnf("( G1 and G2 ) or G3") is True + + +def _model_with_rules(rules: dict[str, str]) -> cobra.Model: + model = cobra.Model("t") + model.add_reactions([cobra.Reaction(rid) for rid in rules]) + for rid, rule in rules.items(): + model.reactions.get_by_id(rid).gene_reaction_rule = rule + return model + + +def test_find_non_dnf_grrules_flags_only_offenders(): + model = _model_with_rules( + { + "R_ok_single": "G1", + "R_ok_complex": "G1 and G2", + "R_ok_dnf": "(G1 and G2) or G3", + "R_no_gpr": "", + "R_bad_1": "(G1 or G2) and G3", + "R_bad_2": "(G1 or G2) and (G3 or G4)", + } + ) + + issues = find_non_dnf_grrules(model) + + assert [i.reaction_id for i in issues] == ["R_bad_1", "R_bad_2"] + assert all(isinstance(i, GPRIssue) for i in issues) + assert all("disjunctive normal form" in i.reason for i in issues) + # the reported GPR is the cobra-normalised string + assert issues[0].gpr == "(G1 or G2) and G3" + + +def test_find_non_dnf_grrules_empty_when_all_clean(): + model = _model_with_rules({"R1": "G1 or G2", "R2": "(G1 and G2) or G3"}) + assert find_non_dnf_grrules(model) == [] diff --git a/tests/test_utils_sort.py b/tests/test_utils_sort.py new file mode 100644 index 0000000..18bca24 --- /dev/null +++ b/tests/test_utils_sort.py @@ -0,0 +1,42 @@ +"""Tests for sort_identifiers and write_yaml_model(sort_ids=True).""" +import cobra + +from raven_python.io import read_yaml_model, write_yaml_model +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils import sort_identifiers + + +def _model(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("b_c", "a_c")]) + add_reactions_from_equations( + m, + [ + {"id": "R2", "equation": "a_c --> b_c", "gene_reaction_rule": "GB"}, + {"id": "R1", "equation": "b_c --> a_c", "gene_reaction_rule": "GA"}, + ], + ) + return m + + +def test_sort_identifiers_orders_everything(): + m = _model() + sort_identifiers(m) + assert [r.id for r in m.reactions] == ["R1", "R2"] + assert [x.id for x in m.metabolites] == ["a_c", "b_c"] + assert [g.id for g in m.genes] == ["GA", "GB"] + # lookup index still intact after sorting + assert m.reactions.get_by_id("R2").id == "R2" + + +def test_write_yaml_sort_ids_does_not_mutate(tmp_path): + m = _model() + order_before = [r.id for r in m.reactions] + out = tmp_path / "m.yml" + write_yaml_model(m, out, sort_ids=True) + assert [r.id for r in m.reactions] == order_before # model untouched + # but the file is sorted + text = out.read_text() + assert text.index("R1") < text.index("R2") + reloaded = read_yaml_model(out) + assert [r.id for r in reloaded.reactions] == ["R1", "R2"] diff --git a/tests/test_utils_validate.py b/tests/test_utils_validate.py new file mode 100644 index 0000000..2d38e6f --- /dev/null +++ b/tests/test_utils_validate.py @@ -0,0 +1,80 @@ +"""Tests for check_model (the surviving checks of checkModelStruct).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils import ModelIssue, check_model + + +def _categories(issues, category): + return [i.object_id for i in issues if i.category == category] + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", name="A", compartment="c"), + cobra.Metabolite("b_c", name="B", compartment="c"), + ] + ) + add_reactions_from_equations( + m, [{"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}] + ) + m.reactions.get_by_id("R1").objective_coefficient = 1 + return m + + +def test_clean_model_has_no_issues(model): + assert check_model(model) == [] + + +def test_orphan_metabolite(model): + model.add_metabolites([cobra.Metabolite("orphan_c", name="Orphan", compartment="c")]) + assert "orphan_c" in _categories(check_model(model), "orphan_metabolite") + + +def test_orphan_gene(model): + model.genes.append(cobra.core.gene.Gene("G_lonely")) + assert "G_lonely" in _categories(check_model(model), "orphan_gene") + + +def test_empty_reaction(model): + model.add_reactions([cobra.Reaction("R_empty")]) + assert "R_empty" in _categories(check_model(model), "empty_reaction") + + +def test_empty_metabolite_name(model): + model.add_metabolites([cobra.Metabolite("noname_c", compartment="c")]) + # also an orphan, but we check the name category specifically + assert "noname_c" in _categories(check_model(model), "empty_metabolite_name") + + +def test_duplicate_name_compartment(model): + # second metabolite named "A" in compartment c + dup = cobra.Metabolite("a2_c", name="A", compartment="c") + model.add_metabolites([dup]) + model.reactions.get_by_id("R1").add_metabolites({dup: -1}) # keep it used + issues = [i for i in check_model(model) if i.category == "duplicate_name_compartment"] + assert len(issues) == 1 + assert "a_c" in issues[0].message and "a2_c" in issues[0].message + + +def test_no_objective(model): + model.reactions.get_by_id("R1").objective_coefficient = 0 + cats = [i.category for i in check_model(model)] + assert "objective" in cats + + +def test_multiple_objectives(model): + add_reactions_from_equations(model, [{"id": "R2", "equation": "b_c --> a_c"}]) + model.reactions.get_by_id("R2").objective_coefficient = 1 + obj_issues = [i for i in check_model(model) if i.category == "objective"] + assert len(obj_issues) == 1 + assert "Multiple" in obj_issues[0].message + + +def test_returns_model_issue_instances(model): + model.add_reactions([cobra.Reaction("R_empty")]) + assert all(isinstance(i, ModelIssue) for i in check_model(model)) From 4bc0d6ee29070a0bad4e347a78947c7fae8199a0 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:46:59 +0200 Subject: [PATCH 4/7] Add the model-manipulation layer (add, remove, transport, merge, etc.) --- src/raven_python/manipulation/__init__.py | 36 ++ src/raven_python/manipulation/add.py | 345 ++++++++++++++++++ src/raven_python/manipulation/change.py | 125 +++++++ src/raven_python/manipulation/compartments.py | 196 ++++++++++ src/raven_python/manipulation/expand.py | 124 +++++++ src/raven_python/manipulation/irreversible.py | 72 ++++ src/raven_python/manipulation/merge.py | 146 ++++++++ src/raven_python/manipulation/parameters.py | 78 ++++ src/raven_python/manipulation/remove.py | 120 ++++++ src/raven_python/manipulation/simplify.py | 229 ++++++++++++ src/raven_python/manipulation/transfer.py | 144 ++++++++ src/raven_python/manipulation/transport.py | 157 ++++++++ tests/test_change_grrules.py | 49 +++ tests/test_manipulation_add.py | 278 ++++++++++++++ tests/test_manipulation_change.py | 93 +++++ tests/test_manipulation_compartments.py | 139 +++++++ tests/test_manipulation_expand.py | 288 +++++++++++++++ tests/test_manipulation_irreversible.py | 144 ++++++++ tests/test_manipulation_merge.py | 136 +++++++ tests/test_manipulation_remove.py | 97 +++++ tests/test_manipulation_simplify.py | 184 ++++++++++ tests/test_manipulation_transfer.py | 137 +++++++ tests/test_manipulation_transport.py | 98 +++++ tests/test_parameters.py | 60 +++ 24 files changed, 3475 insertions(+) create mode 100644 src/raven_python/manipulation/__init__.py create mode 100644 src/raven_python/manipulation/add.py create mode 100644 src/raven_python/manipulation/change.py create mode 100644 src/raven_python/manipulation/compartments.py create mode 100644 src/raven_python/manipulation/expand.py create mode 100644 src/raven_python/manipulation/irreversible.py create mode 100644 src/raven_python/manipulation/merge.py create mode 100644 src/raven_python/manipulation/parameters.py create mode 100644 src/raven_python/manipulation/remove.py create mode 100644 src/raven_python/manipulation/simplify.py create mode 100644 src/raven_python/manipulation/transfer.py create mode 100644 src/raven_python/manipulation/transport.py create mode 100644 tests/test_change_grrules.py create mode 100644 tests/test_manipulation_add.py create mode 100644 tests/test_manipulation_change.py create mode 100644 tests/test_manipulation_compartments.py create mode 100644 tests/test_manipulation_expand.py create mode 100644 tests/test_manipulation_irreversible.py create mode 100644 tests/test_manipulation_merge.py create mode 100644 tests/test_manipulation_remove.py create mode 100644 tests/test_manipulation_simplify.py create mode 100644 tests/test_manipulation_transfer.py create mode 100644 tests/test_manipulation_transport.py create mode 100644 tests/test_parameters.py diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py new file mode 100644 index 0000000..074c36f --- /dev/null +++ b/src/raven_python/manipulation/__init__.py @@ -0,0 +1,36 @@ +"""Generic cobra.Model structural transforms that cobrapy does not cover cleanly: +reaction building from equations, batch GPR / bound changes, irreversibility splitting, +isozyme expansion, compartment merge / copy, and model merging by name.""" +from .add import add_reactions_from_equations +from .change import change_gene_reaction_rules, change_reaction_equations +from .expand import expand_model +from .irreversible import convert_to_irreversible +from .merge import merge_models +from .parameters import set_variance_bounds +from .remove import remove_genes, remove_metabolites +from .simplify import ( + constrain_reversible_reactions, + group_linear_reactions, + remove_dead_end_reactions, + remove_duplicate_reactions, +) +from .transfer import add_reactions_from_model +from .transport import add_transport_reactions + +__all__ = [ + "add_reactions_from_equations", + "add_reactions_from_model", + "add_transport_reactions", + "change_gene_reaction_rules", + "change_reaction_equations", + "constrain_reversible_reactions", + "convert_to_irreversible", + "expand_model", + "group_linear_reactions", + "merge_models", + "remove_dead_end_reactions", + "remove_duplicate_reactions", + "remove_genes", + "remove_metabolites", + "set_variance_bounds", +] diff --git a/src/raven_python/manipulation/add.py b/src/raven_python/manipulation/add.py new file mode 100644 index 0000000..3842297 --- /dev/null +++ b/src/raven_python/manipulation/add.py @@ -0,0 +1,345 @@ +"""Add reactions to a model from equation strings. + +Most of the equivalent MATLAB code is struct-of-arrays bookkeeping (padding parallel +``rxnNames`` / ``lb`` / ``ub`` / ``grRules`` / ... fields) that does not exist in +cobra, where each ``Reaction`` carries its own attributes. cobra also already +covers a large part of the *behaviour*: + +* ``Reaction.build_reaction_from_string`` parses equation strings, coefficients, + and reversibility arrows (``<=>``, ``-->``, ``=>``) and creates unknown + metabolites — but only matching metabolites **by ID**, and it leaves new + metabolites with ``compartment=None``. +* assigning ``reaction.gene_reaction_rule`` auto-creates ``Gene`` objects. + +So this port keeps only the parts cobra lacks: + +* **name-based matching** — interpret equation tokens as metabolite *names* + (RAVEN eqnType 2) or as ``name[comp]`` (eqnType 3), not just IDs; +* **correct compartment** assignment for newly created metabolites; +* **strict policies** — optionally *error* (rather than silently create) on + unknown metabolites or genes, and always error on a duplicate reaction ID + (cobra silently ignores those). + +Instead of RAVEN's ``eqnType`` integer (1/2/3) the matching mode is a readable +keyword: ``mets_by="id"`` or ``mets_by="name"``, with ``name[comp]`` recognised +automatically. See IMPROVEMENTS.md (A-series) for the rationale. +""" +from __future__ import annotations + +import re +import warnings +from collections import OrderedDict +from collections.abc import Mapping, Sequence + +import cobra +from cobra import Metabolite, Reaction +from cobra.core.gene import GPR + +from raven_python.utils.parse import parse_name_comp + +# Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it). +_REVERSIBLE_ARROWS = ("<=>",) +_FORWARD_ARROWS = ("-->", "->", "=>") + + +def _split_equation(equation: str) -> tuple[str, str, bool]: + """Split an equation into (lhs, rhs, reversible) on its arrow.""" + for arrow in _REVERSIBLE_ARROWS: + if arrow in equation: + lhs, rhs = equation.split(arrow, 1) + return lhs, rhs, True + for arrow in _FORWARD_ARROWS: + if arrow in equation: + lhs, rhs = equation.split(arrow, 1) + return lhs, rhs, False + raise ValueError(f"No reaction arrow (<=>, -->, =>) found in equation: {equation!r}") + + +def _parse_side(side: str) -> list[tuple[float, str, str | None]]: + """Parse one side of an equation into ``[(coefficient, token, fallback), ...]``. + + The ``fallback`` slot is for the ambiguous ``" "`` shape: when + matching by name, ``"2 oxoglutarate"`` could be either ``coeff=2, name="oxoglutarate"`` + or ``coeff=1, name="2 oxoglutarate"`` (a real chemistry name). We return the + coefficient-split form as the primary and the full term as the fallback; the + resolver picks whichever matches an existing metabolite. Pure-number heads + with no name (``"2"``) and pure-name terms (``"glucose"``) have no fallback. + """ + terms: list[tuple[float, str, str | None]] = [] + for raw in side.split(" + "): + term = raw.strip() + if not term: + continue + head, _, tail = term.partition(" ") + try: + coeff = float(head) + token = tail.strip() + except ValueError: + coeff, token = 1.0, term + fallback = None + else: + # Coefficient-split succeeded. Keep the full term as a fallback when + # the tail is non-empty so name-resolution can re-try it as one token. + fallback = term if token else None + if not token: + raise ValueError(f"Missing metabolite after coefficient in term: {raw!r}") + terms.append((coeff, token, fallback)) + return terms + + +def _new_met_id(model: cobra.Model, prefix: str) -> str: + """Next free ```` metabolite ID (RAVEN m1, m2, ... scheme).""" + pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$") + used = [int(m.group(1)) for met in model.metabolites if (m := pattern.match(met.id))] + n = max(used) + 1 if used else 1 + while f"{prefix}{n}" in model.metabolites: + n += 1 + return f"{prefix}{n}" + + +def _try_existing( + model: cobra.Model, token: str, *, mets_by: str, compartment: str | None +) -> Metabolite | None: + """Look up ``token`` as an existing metabolite (no creation, no side effects). + + Returns the matching metabolite or ``None``. Used by ``_stoichiometry`` to + disambiguate the ``" "`` shape: if a metabolite whose *name* + (or id) literally contains a leading number exists, prefer it over splitting + the number off as a coefficient. + """ + name, comp = parse_name_comp(token) + if mets_by == "id" and comp is None: + return model.metabolites.get_by_id(token) if token in model.metabolites else None + target_comp = comp if comp is not None else compartment + if target_comp is None: + return None + for met in model.metabolites: + if met.name == name and met.compartment == target_comp: + return met + return None + + +def _resolve_metabolite( + model: cobra.Model, + token: str, + *, + mets_by: str, + compartment: str | None, + allow_new_mets: bool, + new_met_prefix: str, +) -> Metabolite: + """Resolve an equation token to an existing or newly created Metabolite.""" + name, comp = parse_name_comp(token) + + if mets_by == "id" and comp is None: + # token is a metabolite ID + if token in model.metabolites: + return model.metabolites.get_by_id(token) + if not allow_new_mets: + raise ValueError( + f"Unknown metabolite ID {token!r}; pass allow_new_mets=True to create it." + ) + if compartment is None: + raise ValueError( + f"Cannot create metabolite {token!r}: no compartment given." + ) + _warn_unknown_compartment(model, compartment, token) + met = Metabolite(token, compartment=compartment) + model.add_metabolites([met]) + return met + + # name-based (mets_by="name") or explicit name[comp] + target_comp = comp if comp is not None else compartment + if target_comp is None: + raise ValueError( + f"Metabolite {token!r} matched by name needs a compartment; " + "pass compartment=... or use the name[comp] syntax." + ) + if comp is not None and target_comp not in model.compartments and not allow_new_mets: + raise ValueError(f"Compartment {target_comp!r} is not in the model.") + + matches = [ + met + for met in model.metabolites + if met.name == name and met.compartment == target_comp + ] + if matches: + return matches[0] + if not allow_new_mets: + raise ValueError( + f"No metabolite named {name!r} in compartment {target_comp!r}; " + "pass allow_new_mets=True to create it." + ) + _warn_unknown_compartment(model, target_comp, name) + met = Metabolite(_new_met_id(model, new_met_prefix), name=name, compartment=target_comp) + model.add_metabolites([met]) + return met + + +def _warn_unknown_compartment(model: cobra.Model, compartment: str, identifier: str) -> None: + """Warn when a new metabolite would be born into a not-yet-registered compartment. + + Both ``mets_by`` paths previously created the metabolite without validating + the compartment, so a typo (``"cyto"`` for ``"c"``) silently produced a + one-metabolite ghost compartment. cobra inherits the compartment from the + first metabolite assigned to it, so the fix is a warning, not a hard error. + """ + known = set(model.compartments) | set(model._compartments) + if compartment not in known: + warnings.warn( + f"Creating metabolite {identifier!r} in unregistered compartment " + f"{compartment!r} (existing: {sorted(known) or 'none'}); " + "add the compartment first or check for a typo.", + stacklevel=5, + ) + + +def _stoichiometry( + model: cobra.Model, + equation: str, + *, + mets_by: str, + compartment: str | None, + allow_new_mets: bool, + new_met_prefix: str, +) -> tuple[dict[Metabolite, float], bool]: + """Parse an equation into a {Metabolite: net coefficient} dict + reversibility.""" + lhs, rhs, reversible = _split_equation(equation) + coeffs: OrderedDict[Metabolite, float] = OrderedDict() + had_terms = False + for sign, side in ((-1.0, lhs), (1.0, rhs)): + for coeff, token, fallback in _parse_side(side): + had_terms = True + # " " is ambiguous when the name itself starts with a + # number (e.g. "2 oxoglutarate"). Prefer the full-term interpretation + # when it matches an existing metabolite — otherwise fall through to + # the coefficient-split form. + met = None + if fallback is not None: + met = _try_existing( + model, fallback, mets_by=mets_by, compartment=compartment + ) + if met is not None: + coeff = 1.0 + if met is None: + met = _resolve_metabolite( + model, + token, + mets_by=mets_by, + compartment=compartment, + allow_new_mets=allow_new_mets, + new_met_prefix=new_met_prefix, + ) + coeffs[met] = coeffs.get(met, 0.0) + sign * coeff + # Drop metabolites that net to zero (present as both substrate and product). + coeffs = OrderedDict((met, c) for met, c in coeffs.items() if c != 0.0) + if had_terms and not coeffs: + warnings.warn( + f"Equation {equation!r} has no net metabolites (all terms cancelled); " + "the reaction will be added with empty stoichiometry.", + stacklevel=4, + ) + return dict(coeffs), reversible + + +def add_reactions_from_equations( + model: cobra.Model, + reactions: Sequence[Mapping], + *, + mets_by: str = "id", + compartment: str | None = None, + allow_new_mets: bool = True, + allow_new_genes: bool = True, + new_met_prefix: str = "m", +) -> list[Reaction]: + """Add reactions defined by equation strings, matching mets by ID or name. + Parameters + ---------- + model + Target ``cobra.Model``, mutated in place. + reactions + Sequence of mappings, one per reaction. Recognised keys: + + * ``id`` (**required**) — reaction ID; must not already exist. + * ``equation`` (**required**) — e.g. ``"atp_c + h2o_c <=> adp_c + pi_c"``. + Use ``<=>`` for reversible, ``-->``/``->``/``=>`` for irreversible. + * ``name`` — reaction name. + * ``bounds`` — ``(lower, upper)`` tuple; overrides the arrow. + * ``gene_reaction_rule`` — GPR string. + * ``subsystem`` — subsystem name. + mets_by + How bare equation tokens (without ``[comp]``) are matched: + ``"id"`` (RAVEN eqnType 1) or ``"name"`` (eqnType 2). A ``name[comp]`` + token (eqnType 3) is always matched by name + compartment. + compartment + Default compartment for new metabolites and for name-matched tokens + without an explicit ``[comp]``. + allow_new_mets + If True (default), create metabolites not found. New metabolites get + ``compartment`` (id mode) or an auto ID ``m1``, ``m2``, ... (name mode). + If False, an unknown metabolite raises. + allow_new_genes + If True (default), genes in a GPR are auto-created by cobra. If False, + a GPR referencing a gene not already in the model raises. + new_met_prefix + Prefix for auto-generated metabolite IDs in name mode (default ``"m"``). + + Returns + ------- + list of cobra.Reaction + The reactions added, in input order. + """ + if mets_by not in ("id", "name"): + raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}") + + known_genes = {gene.id for gene in model.genes} + added: list[Reaction] = [] + + for spec in reactions: + if "id" not in spec: + raise ValueError(f"Reaction spec missing required 'id': {spec!r}") + rxn_id = spec["id"] + if rxn_id in model.reactions: + raise ValueError( + f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first." + ) + if "equation" not in spec: + raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.") + + coeffs, reversible = _stoichiometry( + model, + spec["equation"], + mets_by=mets_by, + compartment=compartment, + allow_new_mets=allow_new_mets, + new_met_prefix=new_met_prefix, + ) + + rxn = Reaction(rxn_id, name=spec.get("name", "")) + if "bounds" in spec: + rxn.bounds = tuple(spec["bounds"]) + else: + config = cobra.Configuration() + lower = config.lower_bound if reversible else 0.0 + rxn.bounds = (lower, config.upper_bound) + if "subsystem" in spec: + rxn.subsystem = spec["subsystem"] + + model.add_reactions([rxn]) + rxn.add_metabolites(coeffs) + + rule = spec.get("gene_reaction_rule", "") + if rule: + if not allow_new_genes: + missing = sorted(set(GPR.from_string(rule).genes) - known_genes) + if missing: + raise ValueError( + f"Reaction {rxn_id!r} references genes not in the model: " + f"{missing}. Set allow_new_genes=True or add them first." + ) + rxn.gene_reaction_rule = rule + known_genes.update(gene.id for gene in rxn.genes) + + added.append(rxn) + + return added diff --git a/src/raven_python/manipulation/change.py b/src/raven_python/manipulation/change.py new file mode 100644 index 0000000..78612ba --- /dev/null +++ b/src/raven_python/manipulation/change.py @@ -0,0 +1,125 @@ +"""Change the stoichiometry of existing reactions from equation strings. + +Editing the same ``Reaction`` object changes only its stoichiometry — its id, name, +bounds, GPR, subsystem, and position are preserved automatically by cobra. + +So this port simply re-parses the equation (reusing the same metabolite +matching as :func:`~raven_python.manipulation.add.add_reactions_from_equations`, +including name and ``name[comp]`` modes that cobra lacks) and swaps the +metabolites in place. + +Like RAVEN, **bounds are left unchanged** even if the new equation's arrow +implies a different reversibility — use a bounds setter for that. +""" +from __future__ import annotations + +from collections.abc import Mapping + +import cobra +from cobra import Reaction + +from raven_python.manipulation.add import _stoichiometry + +__all__ = ["change_reaction_equations", "change_gene_reaction_rules"] + + +def change_reaction_equations( + model: cobra.Model, + equations: Mapping[str, str], + *, + mets_by: str = "id", + compartment: str | None = None, + allow_new_mets: bool = True, + new_met_prefix: str = "m", +) -> list[Reaction]: + """Replace the stoichiometry of existing reactions. + Parameters + ---------- + model + Target ``cobra.Model``, mutated in place. + equations + Mapping of ``reaction_id -> equation string``. Every ID must already + exist in the model. Equation syntax is identical to + :func:`~raven_python.manipulation.add.add_reactions_from_equations`. + mets_by, compartment, allow_new_mets, new_met_prefix + Metabolite-matching options, as in ``add_reactions_from_equations``. + + Returns + ------- + list of cobra.Reaction + The reactions changed, in input order. + + Notes + ----- + Bounds are **not** modified, matching RAVEN. Changing an equation from + ``-->`` to ``<=>`` does not by itself make the reaction reversible; adjust + the bounds separately. + """ + if mets_by not in ("id", "name"): + raise ValueError(f"mets_by must be 'id' or 'name', got {mets_by!r}") + + changed: list[Reaction] = [] + for rxn_id, equation in equations.items(): + if rxn_id not in model.reactions: + raise ValueError(f"Reaction {rxn_id!r} not found in the model.") + rxn = model.reactions.get_by_id(rxn_id) + + coeffs, _reversible = _stoichiometry( + model, + equation, + mets_by=mets_by, + compartment=compartment, + allow_new_mets=allow_new_mets, + new_met_prefix=new_met_prefix, + ) + + rxn.subtract_metabolites(dict(rxn.metabolites), combine=True) + rxn.add_metabolites(coeffs) + changed.append(rxn) + + return changed + + +def change_gene_reaction_rules( + model: cobra.Model, + rules: Mapping[str, str], + *, + replace: bool = True, +) -> list[Reaction]: + """Set or append gene-reaction rules on existing reactions. + cobra already does the heavy lifting on assignment to + ``reaction.gene_reaction_rule``: it auto-creates any new ``Gene`` objects and + normalises the rule. So the value here is batching plus RAVEN's ``replace`` + option to **append** rather than overwrite. + + Parameters + ---------- + model + Target ``cobra.Model``, mutated in place. + rules + Mapping of ``reaction_id -> GPR string``. Every ID must already exist. + replace + If True (default), overwrite the existing GPR. If False, append the new + rule as an isozyme: ``(old) or (new)`` (just ``new`` if the reaction had + no GPR). + + Returns + ------- + list of cobra.Reaction + The reactions changed, in input order. + """ + changed: list[Reaction] = [] + for rxn_id, rule in rules.items(): + if rxn_id not in model.reactions: + raise ValueError(f"Reaction {rxn_id!r} not found in the model.") + rxn = model.reactions.get_by_id(rxn_id) + + if replace or not rxn.gene_reaction_rule: + new_rule = rule + else: + new_rule = f"({rxn.gene_reaction_rule}) or ({rule})" + + rxn.gene_reaction_rule = new_rule # cobra creates genes + normalises + changed.append(rxn) + + return changed diff --git a/src/raven_python/manipulation/compartments.py b/src/raven_python/manipulation/compartments.py new file mode 100644 index 0000000..091d196 --- /dev/null +++ b/src/raven_python/manipulation/compartments.py @@ -0,0 +1,196 @@ +"""Compartment manipulation — merge all compartments into one, or copy reactions to a +new compartment (ports of RAVEN's ``mergeCompartments`` and ``copyToComps``). + +Both functions are useful **independently of** :func:`raven_python.localization.predict_localization`: +``merge_compartments`` flattens a multi-compartment model for a simplified analysis +(e.g. checking whether the network can in principle make a metabolite, with no +compartment topology in the way); ``copy_to_compartment`` is a building block for +constructing dual-localised pathways. cobra has no equivalents. +""" +from __future__ import annotations + +from collections.abc import Iterable + +import cobra + +# Compartments produced by merge_compartments (RAVEN uses 's' for "system"). +_MERGED_COMPARTMENT = "s" + + +def merge_compartments( + model: cobra.Model, + *, + merged_id: str = _MERGED_COMPARTMENT, + merged_name: str = "system", + drop_single_metabolite_reactions: bool = True, + deduplicate_reactions: bool = True, +) -> tuple[cobra.Model, list[str], list[str]]: + """Merge every metabolite of ``model`` into one ``merged_id`` compartment. + + Returns ``(model_copy, deleted_single_met_reactions, deduplicated_reactions)``. The + returned model is a deep copy of the input. Use cases: + + * Check whether the network can produce/consume a metabolite at all (compartment + topology is often what makes a model look blocked). + * Simplify a model for visualisation or an analysis that doesn't care about + compartments. + * As a pre-step for localisation when the user does want RAVEN's + "start from scratch" workflow (call :func:`merge_compartments` then + :func:`raven_python.localization.predict_localization` with the full reaction list). + + Metabolites that already share a base id (e.g. ``glc__D_c`` and ``glc__D_e`` both + map to ``glc__D``) collapse into one entity in the merged compartment; their + stoichiometric contributions are summed per reaction. Reactions that end up with + only one metabolite (e.g. ``A[c] → A[m]`` becomes ``A → A`` = nothing) are deleted + by default (RAVEN's ``deleteRxnsWithOneMet``). Reactions that become identical + after merging are deduplicated (one survives). + """ + out = model.copy() + + # 1. For each metabolite, derive a base id (strip the trailing _). + # Two mets in different compartments sharing the base id collapse to one. + new_to_old: dict[str, list[cobra.Metabolite]] = {} + for m in list(out.metabolites): + base = _base_id(m) + new_to_old.setdefault(base, []).append(m) + + # 2. Build the merged metabolites and rewrite reactions. + canonical: dict[str, cobra.Metabolite] = {} + for base, mets in new_to_old.items(): + proto = mets[0] + new_met = cobra.Metabolite(base, name=proto.name, compartment=merged_id, + formula=proto.formula, charge=proto.charge) + new_met.notes = dict(proto.notes or {}) + canonical[base] = new_met + + # Rewrite all reactions: replace each metabolite with its canonical, summing + # coefficients where multiple original mets collapse to one. + rewritten: dict[str, dict[str, float]] = {} + for r in list(out.reactions): + new_stoich: dict[cobra.Metabolite, float] = {} + for m, coeff in list(r.metabolites.items()): + canon = canonical[_base_id(m)] + new_stoich[canon] = new_stoich.get(canon, 0.0) + coeff + # Drop zero net coefficients (substrate + product of the same base met cancel). + new_stoich = {m: c for m, c in new_stoich.items() if c != 0.0} + rewritten[r.id] = {m.id: c for m, c in new_stoich.items()} + + # Now build a fresh model with the canonical mets + rewritten reactions; the + # cobra in-place rewrite would require careful constraint surgery, so a clean + # rebuild is simpler and less error-prone. + merged = cobra.Model(out.id or "merged") + merged.compartments = {merged_id: merged_name} + merged.add_metabolites(list(canonical.values())) + deleted_single: list[str] = [] + deduplicated: list[str] = [] + seen_signatures: dict[tuple, str] = {} + keep_reactions: list[cobra.Reaction] = [] + for r in out.reactions: + stoich = rewritten[r.id] + if drop_single_metabolite_reactions and len(stoich) <= 1: + deleted_single.append(r.id) + continue + if not stoich: # everything cancelled + deleted_single.append(r.id) + continue + sig = (frozenset(stoich.items()), bool(r.lower_bound < 0), bool(r.upper_bound > 0)) + if deduplicate_reactions and sig in seen_signatures: + deduplicated.append(r.id) + continue + seen_signatures[sig] = r.id + new_r = cobra.Reaction(r.id, name=r.name, lower_bound=r.lower_bound, + upper_bound=r.upper_bound) + new_r.add_metabolites({merged.metabolites.get_by_id(mid): c for mid, c in stoich.items()}) + new_r.gene_reaction_rule = r.gene_reaction_rule + if r.subsystem: + new_r.subsystem = r.subsystem + new_r.notes = dict(r.notes or {}) + keep_reactions.append(new_r) + merged.add_reactions(keep_reactions) + return merged, deleted_single, deduplicated + + +def copy_to_compartment( + model: cobra.Model, + reactions: Iterable[str], + target_compartment: str, + *, + target_compartment_name: str | None = None, + delete_original: bool = False, + id_suffix: str | None = None, +) -> tuple[cobra.Model, list[str], list[str]]: + """Copy a set of reactions into ``target_compartment``. RAVEN's ``copyToComps``. + + Returns ``(model_copy, new_reaction_ids, new_metabolite_ids)``. Use cases: + + * Build a dual-localised pathway (e.g. duplicate glycolysis into a peroxisome). + * Mirror a curated subsystem into an additional compartment as a draft to refine. + * Set up the input for a flux comparison between alternate compartmentalisations. + + Each copied reaction is given the id ``"_"`` (default + ``id_suffix=target_compartment``); each metabolite it touches is mapped to (or + created in) ``target_compartment`` with the same suffix convention. ``delete_original=True`` + moves the reactions instead of copying. + """ + out = model.copy() + suffix = id_suffix if id_suffix is not None else target_compartment + if target_compartment not in out.compartments: + out.compartments = {**out.compartments, + target_compartment: target_compartment_name or target_compartment} + + preexisting_met_ids = {x.id for x in out.metabolites} + new_rxn_ids: list[str] = [] + for rid in list(reactions): + if rid not in out.reactions: + raise ValueError(f"reaction {rid!r} not in model") + src = out.reactions.get_by_id(rid) + new_id = f"{rid}_{suffix}" + if new_id in out.reactions: + continue # already copied; idempotent + new_stoich: dict[cobra.Metabolite, float] = {} + for m, coeff in src.metabolites.items(): + target_met = _met_in_compartment(out, m, target_compartment, suffix=suffix) + new_stoich[target_met] = coeff + new_r = cobra.Reaction(new_id, name=src.name, + lower_bound=src.lower_bound, upper_bound=src.upper_bound) + new_r.add_metabolites(new_stoich) + new_r.gene_reaction_rule = src.gene_reaction_rule + if src.subsystem: + new_r.subsystem = src.subsystem + new_r.notes = dict(src.notes or {}) + out.add_reactions([new_r]) + new_rxn_ids.append(new_id) + if delete_original: + out.remove_reactions([src.id], remove_orphans=False) + + new_met_ids = [m.id for m in out.metabolites if m.id not in preexisting_met_ids] + return out, new_rxn_ids, new_met_ids + + +# ----------------------------------------------------------------- helpers + +def _base_id(m: cobra.Metabolite) -> str: + """Strip the trailing ``_`` suffix from a metabolite id (if present).""" + if m.compartment and m.id.endswith(f"_{m.compartment}"): + return m.id[: -(len(m.compartment) + 1)] + return m.id + + +def _met_in_compartment(model: cobra.Model, source: cobra.Metabolite, + compartment: str, *, suffix: str | None = None) -> cobra.Metabolite: + """Return (creating if needed) the copy of ``source`` in ``compartment``. + + The new metabolite id is ``"_"`` (default ``suffix=compartment``). + Already-existing copies are reused. + """ + if source.compartment == compartment: + return source + base = _base_id(source) + new_id = f"{base}_{suffix if suffix is not None else compartment}" + if new_id in model.metabolites: + return model.metabolites.get_by_id(new_id) + new_met = cobra.Metabolite(new_id, name=source.name, compartment=compartment, + formula=source.formula, charge=source.charge) + new_met.notes = dict(source.notes or {}) + model.add_metabolites([new_met]) + return new_met diff --git a/src/raven_python/manipulation/expand.py b/src/raven_python/manipulation/expand.py new file mode 100644 index 0000000..246f3b9 --- /dev/null +++ b/src/raven_python/manipulation/expand.py @@ -0,0 +1,124 @@ +"""Expand reactions with isozymes into one reaction per isozyme. + +Operates on cobra's GPR AST, so the model stays a plain ``cobra.Model`` throughout. + +Provenance: this implementation was first written for geckopy +(``geckopy/ec_model/pipeline/expand.py``, where it backed makeEcModel stage 5) +and is adopted here as its canonical home; geckopy will import it from raven_python +once raven_python is published. + +MATLAB-COMPAT: GECKO MATLAB and RAVEN ``expandModel.m`` use string manipulation +on grRules to detect and split isozymes. raven_python uses cobrapy's GPR AST +instead. Output should be equivalent for any well-formed GPR; cases that differ +are likely malformed GPR strings that the AST flags as invalid. +""" +from __future__ import annotations + +import ast +import copy + +import cobra +from cobra.core.gene import GPR + + +def _gpr_to_dnf(gpr: GPR) -> list[list[str]]: + """Convert a GPR to disjunctive normal form (list of AND-clauses). + + An empty GPR yields an empty list. A single clause (no OR anywhere) + yields a list of length 1. OR-of-ANDs yields one sublist per + disjunct, each containing the gene names ANDed together. + + Handles distributivity: ``g1 and (g2 or g3)`` becomes + ``[[g1, g2], [g1, g3]]``. + """ + if gpr is None or gpr.body is None: + return [] + return _node_to_dnf(gpr.body) + + +def _node_to_dnf(node) -> list[list[str]]: + """Recursive helper. Returns DNF as list of AND-clauses.""" + if isinstance(node, ast.Name): + return [[node.id]] + if isinstance(node, ast.BoolOp): + if isinstance(node.op, ast.Or): + result: list[list[str]] = [] + for child in node.values: + result.extend(_node_to_dnf(child)) + return result + if isinstance(node.op, ast.And): + clauses: list[list[str]] = [[]] + for child in node.values: + child_dnf = _node_to_dnf(child) + new_clauses: list[list[str]] = [] + for existing in clauses: + for extra in child_dnf: + new_clauses.append(existing + extra) + clauses = new_clauses + return clauses + raise ValueError(f"Unexpected GPR node type: {type(node).__name__}") + + +def expand_model(model: cobra.Model) -> list[str]: + """Split reactions with isozymes (OR in GPR) into one reaction per isozyme. + For each reaction whose GPR contains at least one OR, the reaction + is removed and replaced by one copy per disjunctive clause. The new + reactions get ID suffix ``_EXP_1``, ``_EXP_2``, etc. All other + fields (stoichiometry, bounds, name, subsystem) are copied verbatim; + only the GPR is simplified to the single AND-clause for that + isozyme. + + Reactions with no GPR, or with a GPR that has no OR, are left + untouched. + + Parameters + ---------- + model + A cobra.Model, mutated in place. + + Returns + ------- + list of str + Sorted IDs of newly added expanded reactions (those with + ``_EXP_N`` suffixes). The original reactions that were split + are no longer in the model. + """ + expansions: list[tuple[cobra.Reaction, list[list[str]]]] = [] + + for rxn in model.reactions: + if not rxn.gene_reaction_rule: + continue + clauses = _gpr_to_dnf(rxn.gpr) + if len(clauses) <= 1: + continue + expansions.append((rxn, clauses)) + + added_ids: list[str] = [] + for original_rxn, clauses in expansions: + new_rxns: list[cobra.Reaction] = [] + for i, clause in enumerate(clauses, start=1): + new_rxn = cobra.Reaction( + id=f"{original_rxn.id}_EXP_{i}", + name=original_rxn.name, + ) + new_rxn.lower_bound = original_rxn.lower_bound + new_rxn.upper_bound = original_rxn.upper_bound + new_rxn.add_metabolites(dict(original_rxn.metabolites.items())) + new_rxn.subsystem = original_rxn.subsystem + new_rxn.gene_reaction_rule = " and ".join(clause) + # Propagate per-reaction metadata (notably ec-code / annotations) + # so downstream functions see the same annotations on expanded + # reactions as on the original. Deep-copy so siblings are independent. + new_rxn.annotation = copy.deepcopy(original_rxn.annotation) + new_rxn.notes = copy.deepcopy(original_rxn.notes) + new_rxns.append(new_rxn) + + obj_coeff = original_rxn.objective_coefficient + model.remove_reactions([original_rxn]) + model.add_reactions(new_rxns) + if obj_coeff: # keep the original in the objective — sum over its isozyme copies + for new_rxn in new_rxns: + new_rxn.objective_coefficient = obj_coeff + added_ids.extend(r.id for r in new_rxns) + + return sorted(added_ids) diff --git a/src/raven_python/manipulation/irreversible.py b/src/raven_python/manipulation/irreversible.py new file mode 100644 index 0000000..3f64a68 --- /dev/null +++ b/src/raven_python/manipulation/irreversible.py @@ -0,0 +1,72 @@ +"""Convert reversible reactions to an irreversible (forward + reverse) form. + +cobrapy's own ``convert_to_irreversible`` was removed, so this is a genuine +implementation rather than a wrapper. + +Provenance: first written for geckopy +(``geckopy/ec_model/pipeline/preprocess.py``, makeEcModel stage 4, tagged +"RAVENpy candidate") and adopted here as its canonical home; geckopy will +import it from raven_python once raven_python is published. +""" +from __future__ import annotations + +import cobra + + +def convert_to_irreversible(model: cobra.Model) -> list[str]: + """Split non-exchange reversible reactions into a forward + reverse pair. + For each non-exchange reaction with ``lb < 0``: + + - The original reaction is kept as the forward direction. Its + lower bound is clamped to 0. + - A new reaction with the same ID plus a ``_REV`` suffix is added, + representing the reverse direction. Its stoichiometry is the + negation of the original, its bounds are ``(0, -original_lb)``, + and it inherits the name (with " (reversible)" appended) and the + gene-protein rule of the original. + + Exchange reactions (boundary reactions) are never split, regardless + of their bounds, matching MATLAB behavior where exchange reactions + are explicitly excluded from ``convertToIrrev``. + + Parameters + ---------- + model + A cobra.Model, mutated in place. + + Returns + ------- + list of str + Sorted IDs of newly added reverse reactions (the ones ending in + ``_REV``). The forward reactions retain their original IDs. + """ + reverse_rxns_to_add: list[cobra.Reaction] = [] + forward_updates: list[cobra.Reaction] = [] + + for rxn in model.reactions: + if rxn.boundary: + continue + if rxn.lower_bound >= 0: + continue + + original_lb = rxn.lower_bound + + rev_rxn = cobra.Reaction( + id=f"{rxn.id}_REV", + name=(f"{rxn.name} (reversible)" if rxn.name else f"{rxn.id}_REV"), + ) + rev_rxn.lower_bound = 0.0 + rev_rxn.upper_bound = -original_lb + rev_rxn.add_metabolites({m: -c for m, c in rxn.metabolites.items()}) + rev_rxn.gene_reaction_rule = rxn.gene_reaction_rule + + reverse_rxns_to_add.append(rev_rxn) + forward_updates.append(rxn) + + for rxn in forward_updates: + rxn.lower_bound = 0.0 + + if reverse_rxns_to_add: + model.add_reactions(reverse_rxns_to_add) + + return sorted(r.id for r in reverse_rxns_to_add) diff --git a/src/raven_python/manipulation/merge.py b/src/raven_python/manipulation/merge.py new file mode 100644 index 0000000..bfa1f24 --- /dev/null +++ b/src/raven_python/manipulation/merge.py @@ -0,0 +1,146 @@ +"""Merge several models into one. + +cobra's ``Model.merge`` is pairwise and matches everything strictly by id; this +merges **N** models and unifies metabolites by **name[compartment]** (so the same +compound under different ids in two models becomes one), while adding **all** +reactions without de-duplication +(a reaction whose ID already exists is renamed ``id_``). Genes are +unified by ID. Provenance (which source model each object came from) is recorded +in ``notes['origin']``. + +The bulk of RAVEN's function is struct field-padding and manual S-matrix +assembly, none of which is needed on ``cobra.Model``. +""" +from __future__ import annotations + +import copy +import warnings +from collections.abc import Iterable + +import cobra +from cobra import Metabolite, Model, Reaction + + +def _unique_id(existing, base: str, suffix: str) -> str: + """Return base, or base_suffix (then base_suffix_2, ...) if it collides.""" + if base not in existing: + return base + candidate = f"{base}_{suffix}" + n = 2 + while candidate in existing: + candidate = f"{base}_{suffix}_{n}" + n += 1 + return candidate + + +def merge_models( + models: Iterable[cobra.Model], + *, + match_by: str = "name", + track_origin: bool = True, +) -> cobra.Model: + """Merge models into a single new model. + Parameters + ---------- + models + The models to merge (two or more). A single model is returned as a copy. + match_by + How metabolites are unified across models: ``"name"`` (default) treats + metabolites with the same *name and compartment* as identical (IDs + ignored); ``"id"`` matches by metabolite ID. + track_origin + If True (default), record the source model's ``id`` in each reaction's, + metabolite's, and gene's ``notes['origin']``. + + Returns + ------- + cobra.Model + A new merged model (``id="MERGED"``). Reactions are **not** de-duplicated + — matching RAVEN, every reaction from every model is kept, with ID + collisions renamed ``id_``. + """ + models = list(models) + if not models: + raise ValueError("merge_models requires at least one model.") + if match_by not in ("name", "id"): + raise ValueError(f"match_by must be 'name' or 'id', got {match_by!r}") + if len(models) == 1: + return models[0].copy() + + merged = Model("MERGED") + comp_names: dict[str, str] = {} + met_lookup: dict = {} # name/comp or id key -> merged Metabolite + + def met_key(met: Metabolite): + return (met.name, met.compartment) if match_by == "name" else met.id + + def ensure_metabolite(src: Metabolite, origin: str) -> Metabolite: + key = met_key(src) + if key in met_lookup: + existing = met_lookup[key] + # Two source models can map to the same name[comp] (or id) with + # different formula/charge; silently picking the first-seen has + # quietly corrupted mass balance in the past. Warn so the caller + # sees the conflict. + if src.formula and existing.formula and src.formula != existing.formula: + warnings.warn( + f"merge_models: metabolite {existing.id!r} (from earlier model) " + f"and {src.id!r} (from {origin!r}) share key {key!r} but " + f"have different formulas ({existing.formula!r} vs {src.formula!r}); " + "keeping the first.", + stacklevel=3, + ) + if ( + existing.charge is not None + and src.charge is not None + and existing.charge != src.charge + ): + warnings.warn( + f"merge_models: metabolite {existing.id!r} (from earlier model) " + f"and {src.id!r} (from {origin!r}) share key {key!r} but " + f"have different charges ({existing.charge} vs {src.charge}); " + "keeping the first.", + stacklevel=3, + ) + return existing + new_id = _unique_id(merged.metabolites, src.id, origin) + new_met = Metabolite( + new_id, name=src.name, compartment=src.compartment, + formula=src.formula, charge=src.charge, + ) + new_met.annotation = copy.deepcopy(src.annotation) + new_met.notes = copy.deepcopy(src.notes) + if track_origin: + new_met.notes.setdefault("origin", origin) + merged.add_metabolites([new_met]) + met_lookup[key] = new_met + return new_met + + for model in models: + origin = model.id or "model" + comp_names.update(model.compartments) + genes_before = {g.id for g in merged.genes} + + for rxn in model.reactions: + new_id = _unique_id(merged.reactions, rxn.id, origin) + new_rxn = Reaction(new_id, name=rxn.name) + new_rxn.bounds = rxn.bounds + new_rxn.subsystem = rxn.subsystem + merged.add_reactions([new_rxn]) + new_rxn.add_metabolites( + {ensure_metabolite(m, origin): coef for m, coef in rxn.metabolites.items()} + ) + if rxn.gene_reaction_rule: + new_rxn.gene_reaction_rule = rxn.gene_reaction_rule + new_rxn.annotation = copy.deepcopy(rxn.annotation) + new_rxn.notes = copy.deepcopy(rxn.notes) + if track_origin: + new_rxn.notes.setdefault("origin", origin) + + if track_origin: + for gene in merged.genes: + if gene.id not in genes_before: + gene.notes.setdefault("origin", origin) + + merged._compartments.update(comp_names) + return merged diff --git a/src/raven_python/manipulation/parameters.py b/src/raven_python/manipulation/parameters.py new file mode 100644 index 0000000..f349804 --- /dev/null +++ b/src/raven_python/manipulation/parameters.py @@ -0,0 +1,78 @@ +"""Set reaction bounds to a sign-aware ±% variance band around measured values. + +Cobra has no idiom for the *variance band* case (e.g. "5 ± 20 %"); the other common +bound-setting cases are cobra one-liners: + +* fixed lb / ub → ``reaction.lower_bound`` / ``upper_bound`` / ``reaction.bounds`` +* equality → ``reaction.bounds = (v, v)`` +* objective → ``model.objective = {reaction: coeff}`` +* unconstrained → ``reaction.bounds = cobra.Configuration().bounds`` +""" +from __future__ import annotations + +from collections.abc import Iterable, Sequence + +import cobra +from cobra import Reaction + +Number = int | float + + +def _resolve(model: cobra.Model, reactions) -> list[Reaction]: + if isinstance(reactions, (str, Reaction)): + reactions = [reactions] + out: list[Reaction] = [] + for r in reactions: + if isinstance(r, Reaction): + out.append(r) + elif r in model.reactions: + out.append(model.reactions.get_by_id(r)) + else: + raise ValueError(f"Reaction {r!r} not found in the model.") + return out + + +def _broadcast(value, n: int) -> list[float]: + if isinstance(value, (int, float)): + return [float(value)] * n + vals = [float(v) for v in value] + if len(vals) != n: + raise ValueError( + f"Expected 1 or {n} values to match the reactions, got {len(vals)}." + ) + return vals + + +def set_variance_bounds( + model: cobra.Model, + reactions: str | Reaction | Iterable, + values: Number | Sequence[Number], + percent: Number, +) -> list[Reaction]: + """Constrain reactions to a ``±percent/2`` band around measured values. + + For a measured value ``v`` and ``percent`` ``p``, the bounds become + ``v * (1 - p/200) .. v * (1 + p/200)`` — i.e. ``percent`` is the *total* + width, split half above and half below. For a negative ``v`` the two are + swapped so that ``lb <= ub``. E.g. ``percent=5`` gives 97.5 %..102.5 % of ``v``. + + Parameters + ---------- + reactions + Reaction IDs or objects. + values + Measured value per reaction; a scalar is broadcast to all reactions. + percent + Total band width as a percentage. + + Returns + ------- + list of cobra.Reaction + The reactions affected. + """ + rxns = _resolve(model, reactions) + half = percent / 200.0 + for rxn, v in zip(rxns, _broadcast(values, len(rxns)), strict=True): + lo, hi = v * (1 - half), v * (1 + half) + rxn.bounds = (hi, lo) if v < 0 else (lo, hi) + return rxns diff --git a/src/raven_python/manipulation/remove.py b/src/raven_python/manipulation/remove.py new file mode 100644 index 0000000..492de36 --- /dev/null +++ b/src/raven_python/manipulation/remove.py @@ -0,0 +1,120 @@ +"""Remove metabolites or genes from a model. + +For removing *reactions*, use cobra directly: +``cobra.Model.remove_reactions(reactions, remove_orphans=...)``. + +The two functions here delegate the core to cobra and add the cobra-absent behaviour: + +* ``remove_metabolites`` — cobra matches metabolites by ID; RAVEN's ``isNames`` + deletes a metabolite in **every compartment at once** by name. That name + resolution is the *sole* reason this wrapper exists (see the note on it). +* ``remove_genes`` — cobra's ``cobra.manipulation.remove_genes`` already rewrites + GPRs through the boolean AST (removing one gene of ``A and B`` empties the + rule, of ``A or B`` keeps the other) — exactly RAVEN's intent, without its + ``eval``. The gap is RAVEN's default of **constraining** flux-blocked reactions + to zero instead of deleting them; exposed as ``blocked_reactions``. +""" +from __future__ import annotations + +from collections.abc import Iterable + +import cobra +from cobra import Gene, Metabolite +from cobra.manipulation import remove_genes as _cobra_remove_genes + + +def _as_list(obj) -> list: + if isinstance(obj, (str, Metabolite, Gene)): + return [obj] + return list(obj) + + +def remove_metabolites( + model: cobra.Model, + metabolites: str | Metabolite | Iterable, + *, + by_name: bool = False, + destructive: bool = False, +) -> None: + """Remove metabolites, optionally matching by name across all compartments. + + Parameters + ---------- + by_name + If True, ``metabolites`` are metabolite *names*; every metabolite with a + matching name is removed, regardless of compartment (RAVEN ``isNames``). + If False, they are IDs/objects, resolved via cobra. + destructive + Passed to cobra: if True, also remove every reaction the metabolite + participates in. + + Note + ---- + With ``by_name=False`` this is just ``model.remove_metabolites`` — the + ``by_name`` cross-compartment deletion is the only thing this adds over cobra. + """ + if by_name: + wanted = set(_as_list(metabolites)) + targets = [m for m in model.metabolites if m.name in wanted] + else: + targets = model.metabolites.get_by_any(_as_list(metabolites)) + if targets: + model.remove_metabolites(targets, destructive=destructive) + + +def remove_genes( + model: cobra.Model, + genes: str | Gene | Iterable, + *, + blocked_reactions: str = "remove", + remove_orphans: bool = False, +) -> list[str]: + """Remove genes and handle reactions left unable to carry flux. + + GPR rewriting (with correct AND/OR semantics) and gene deletion are done by cobra; + this adds a policy for reactions whose GPR becomes empty (no enzyme left): + + * ``"remove"`` — delete them (cobra's default). + * ``"constrain"`` — keep them but set bounds to ``(0, 0)``. + * ``"keep"`` — leave them with an empty GPR and unchanged bounds. + + ``remove_orphans`` (only meaningful with ``blocked_reactions="remove"``) + passes through to cobra: drop metabolites *and* genes orphaned by the removal. + + Returns + ------- + list of str + IDs of the reactions that became flux-blocked (had a GPR, now empty). + """ + if blocked_reactions not in ("remove", "constrain", "keep"): + raise ValueError( + f"blocked_reactions must be 'remove', 'constrain', or 'keep', " + f"got {blocked_reactions!r}" + ) + + # Resolve to gene IDs that are actually in the model (RAVEN filters likewise). + requested = [g.id if isinstance(g, Gene) else g for g in _as_list(genes)] + present = [gid for gid in requested if gid in model.genes] + if not present: + return [] + + # Reactions touched by these genes that currently have a GPR. + affected = set() + for gid in present: + affected.update(r.id for r in model.genes.get_by_id(gid).reactions) + had_gpr = {rid for rid in affected if model.reactions.get_by_id(rid).gene_reaction_rule} + + # cobra rewrites GPRs (AST) and removes the gene objects; we manage reactions. + _cobra_remove_genes(model, present, remove_reactions=False) + + blocked = [ + rid for rid in had_gpr if not model.reactions.get_by_id(rid).gene_reaction_rule + ] + + if blocked_reactions == "remove": + model.remove_reactions(blocked, remove_orphans=remove_orphans) + elif blocked_reactions == "constrain": + for rid in blocked: + model.reactions.get_by_id(rid).bounds = (0, 0) + + return sorted(blocked) diff --git a/src/raven_python/manipulation/simplify.py b/src/raven_python/manipulation/simplify.py new file mode 100644 index 0000000..2deaccd --- /dev/null +++ b/src/raven_python/manipulation/simplify.py @@ -0,0 +1,229 @@ +"""Reduce a model by removing/merging reactions that cannot carry flux. + +Four reduction modes that cobra does not cover out of the box: +``remove_dead_end_reactions`` (reactions whose substrates have no producer), +``remove_duplicate_reactions``, ``constrain_reversible_reactions`` (tighten bounds +via FVA), and ``group_linear_reactions`` (lossy fold of unit-stoichiometry chains +into one reaction; drops gene rules). + +Cobra-covered modes that you'd reach for separately: + +* No-flux removal → ``cobra.flux_analysis.find_blocked_reactions``. +* Zero-interval removal → filter reactions with ``bounds == (0, 0)`` then prune. +""" +from __future__ import annotations + +import math +from collections.abc import Iterable + +import cobra +from cobra.flux_analysis import flux_variability_analysis + +from raven_python.manipulation.irreversible import convert_to_irreversible + + +def _prune_orphan_metabolites(model: cobra.Model) -> list[str]: + orphans = [m for m in model.metabolites if not m.reactions] + if orphans: + model.remove_metabolites(orphans) + return [m.id for m in orphans] + + +def _can_produce_and_consume(met) -> tuple[bool, bool]: + """Whether the network can both produce and consume ``met`` (given directions).""" + produce = consume = False + for rxn in met.reactions: + coef = rxn.get_coefficient(met) + if coef > 0: + produce |= rxn.upper_bound > 0 + consume |= rxn.lower_bound < 0 + elif coef < 0: + consume |= rxn.upper_bound > 0 + produce |= rxn.lower_bound < 0 + return produce, consume + + +def remove_dead_end_reactions( + model: cobra.Model, *, reserved: Iterable[str] | None = None +) -> tuple[list[str], list[str]]: + """Iteratively remove dead-end reactions and metabolites. + + A metabolite + is a dead end if it participates in only one reaction, or if (accounting for + reaction directionality) it can only be produced or only consumed — such + metabolites cannot carry steady-state flux, so the reactions touching them + are removed. Repeats until stable. + + Returns ``(removed_reaction_ids, removed_metabolite_ids)``. + """ + reserved = set(reserved or []) + removed_rxns: list[str] = [] + removed_mets: list[str] = [] + while True: + removed_mets += _prune_orphan_metabolites(model) + dead = [ + m + for m in model.metabolites + if len(m.reactions) <= 1 or not all(_can_produce_and_consume(m)) + ] + if not dead: + break + rxns = {r for m in dead for r in m.reactions} + to_delete = [r for r in rxns if r.id not in reserved] + if not to_delete: + break + removed_rxns += [r.id for r in to_delete] + model.remove_reactions(to_delete) + return removed_rxns, removed_mets + + +def _signature(rxn): + mets = frozenset((m.id, c) for m, c in rxn.metabolites.items()) + return (mets, rxn.lower_bound, rxn.upper_bound, rxn.objective_coefficient) + + +def remove_duplicate_reactions( + model: cobra.Model, *, reserved: Iterable[str] | None = None +) -> list[str]: + """Remove all-but-one of each set of duplicate reactions. + + Reactions are duplicates when they have identical stoichiometry, bounds, and + objective coefficient. One of each set is kept (reserved reactions are never + removed). Returns the removed reaction IDs. + """ + reserved = set(reserved or []) + groups: dict = {} + for rxn in model.reactions: + groups.setdefault(_signature(rxn), []).append(rxn) + + removed: list[str] = [] + for rxns in groups.values(): + if len(rxns) <= 1: + continue + keep = rxns[-1] + to_remove = [r for r in rxns if r is not keep and r.id not in reserved] + if to_remove: + removed += [r.id for r in to_remove] + model.remove_reactions(to_remove) + return removed + + +def constrain_reversible_reactions( + model: cobra.Model, *, eps: float = 1e-9 +) -> list[str]: + """Constrain reversible reactions that can only carry flux one way. + + Runs FVA on + each reversible reaction; if it can only carry forward flux its lower bound + is set to 0, and if it can only carry reverse flux it is flipped to a forward + reaction (stoichiometry, bounds, and objective negated). Returns the changed + reaction IDs. + """ + revs = [r for r in model.reactions if r.lower_bound < 0 < r.upper_bound] + if not revs: + return [] + # Infeasible models surface as either OptimizationError (Gurobi/HiGHS) or + # NaN-filled ranges (some optlang backends silently). Catch both and raise + # a single clear error — the original ``abs(NaN) < eps`` comparison would + # have silently no-op'd, letting bogus "all reactions truly reversible" + # decisions sneak through. + try: + fva = flux_variability_analysis( + model, reaction_list=revs, fraction_of_optimum=0.0 + ) + except Exception as exc: # noqa: BLE001 - solver-family agnostic + raise RuntimeError( + "constrain_reversible_reactions: FVA failed — the model is likely " + "infeasible at fraction_of_optimum=0. Fix the infeasibility first " + "(often a missing exchange or an over-constrained essential). " + f"({exc})" + ) from exc + if fva[["minimum", "maximum"]].isna().any().any(): + raise RuntimeError( + "constrain_reversible_reactions: FVA returned NaN ranges — the " + "model is infeasible at fraction_of_optimum=0. Fix the infeasibility " + "first (often a missing exchange or an over-constrained essential)." + ) + + changed: list[str] = [] + for rxn in revs: + lo = fva.at[rxn.id, "minimum"] + hi = fva.at[rxn.id, "maximum"] + # Guard against ±inf ranges (unbounded objective): treat them as truly + # reversible rather than "zero" by the abs(·) < eps check. + if math.isinf(lo) or math.isinf(hi): + continue + min_zero, max_zero = abs(lo) < eps, abs(hi) < eps + if min_zero == max_zero: # both ~0 (blocked) or both nonzero (truly reversible) + continue + if max_zero: # only reverse flux → flip to a forward reaction + old_lb = rxn.lower_bound + rxn.add_metabolites({m: -2 * c for m, c in rxn.metabolites.items()}) + rxn.bounds = (0.0, -old_lb) + rxn.objective_coefficient = -rxn.objective_coefficient + else: # only forward flux + rxn.lower_bound = 0.0 + changed.append(rxn.id) + return changed + + +def group_linear_reactions( + model: cobra.Model, *, reserved: Iterable[str] | None = None +) -> None: + """Merge linear (single-producer, single-consumer) reaction chains. + + **Lossy**: gene-reaction + associations are discarded (RAVEN does the same), since merged reactions have + no meaningful combined GPR. The model is first made irreversible, then any + metabolite that is produced by exactly one reaction and consumed by exactly + one reaction is eliminated by merging the two reactions. Mutates in place. + """ + reserved = set(reserved or []) + + # Lossy: drop all gene information. + for rxn in model.reactions: + rxn.gene_reaction_rule = "" + for gene in list(model.genes): + model.genes.remove(gene) + + convert_to_irreversible(model) + + # Worklist of metabolites to (re)consider for merging. Each metabolite + # participating in a merge can expose new linear chains in its neighbours, + # so we re-enqueue the touched mets rather than restart the whole scan + # (the old O(n²·m) restart-after-every-merge loop). + pending: list = list(model.metabolites) + seen_in_pass: set = set() + while pending: + met = pending.pop() + if met not in model.metabolites: # removed in a previous merge + continue + rxns = list(met.reactions) + if len(rxns) != 2 or any(r.id in reserved for r in rxns): + continue + r1, r2 = rxns + c1, c2 = r1.get_coefficient(met), r2.get_coefficient(met) + if (c1 > 0) == (c2 > 0): # need one producer and one consumer + continue + ratio = abs(c1 / c2) + new_lb = max(r1.lower_bound, r2.lower_bound / ratio) + new_ub = min(r1.upper_bound, r2.upper_bound / ratio) + new_obj = r1.objective_coefficient + r2.objective_coefficient * ratio + # Re-enqueue every metabolite touched by either side — the merge can + # turn neighbours into single-producer/consumer chains in turn. + touched = {m for m in r1.metabolites} | {m for m in r2.metabolites} + # Merge r2*ratio into r1; the shared metabolite cancels and is dropped. + r1.add_metabolites({m: c * ratio for m, c in r2.metabolites.items()}) + model.remove_reactions([r2]) + r1.bounds = (new_lb, new_ub) + r1.objective_coefficient = new_obj + seen_in_pass.clear() + for m in touched: + if m in model.metabolites and id(m) not in seen_in_pass: + seen_in_pass.add(id(m)) + pending.append(m) + # One terminal cleanup pass (cheap; only what remains). + empty = [r for r in model.reactions if not r.metabolites] + if empty: + model.remove_reactions(empty) + _prune_orphan_metabolites(model) diff --git a/src/raven_python/manipulation/transfer.py b/src/raven_python/manipulation/transfer.py new file mode 100644 index 0000000..b867f02 --- /dev/null +++ b/src/raven_python/manipulation/transfer.py @@ -0,0 +1,144 @@ +"""Copy reactions (with their metabolites and genes) from another model. + +cobra's ``Model.merge`` / ``add_reactions`` match metabolites strictly by id. This +transfers a chosen set of reactions from a *source* model into a draft, matching +metabolites by **name[compartment]** instead — so a compound present in both models +under different ids is reused rather than duplicated, and only genuinely new +metabolites are created (copying the source's id, formula, +charge, and annotation). New genes are auto-created by cobra when the GPR is set. +This is the post-``getModelFromHomology`` "copy a few more reactions across" +workflow. +""" +from __future__ import annotations + +import copy +from collections.abc import Iterable + +import cobra +from cobra import Metabolite, Reaction + +from raven_python.manipulation.add import _new_met_id + + +def _name_comp(met: Metabolite) -> str: + return f"{met.name}[{met.compartment}]" + + +def add_reactions_from_model( + model: cobra.Model, + source_model: cobra.Model, + reactions: str | Iterable[str], + *, + genes: bool | str | Iterable[str] = False, + note: str | None = "Added via add_reactions_from_model()", + confidence: int | None = None, +) -> list[Reaction]: + """Copy reactions from ``source_model`` into ``model``. + Parameters + ---------- + model + Draft model to copy into (mutated in place). + source_model + Model to copy reactions from. + reactions + Reaction ID(s) in ``source_model``. Reactions already present in + ``model`` (by ID) are skipped. + genes + ``False`` (default): add reactions without GPRs. ``True``: copy each + reaction's GPR from the source. A string: use it as the GPR for every + added reaction. A list: per-reaction GPRs (matching the reactions that + are actually added). New genes are created automatically. + note + Stored in each added reaction's ``notes['note']`` (set ``None`` to skip). + confidence + If given, stored in each added reaction's ``notes['confidence_score']``. + + Returns + ------- + list of cobra.Reaction + The reactions added, in input order. + """ + rxn_ids = [reactions] if isinstance(reactions, str) else list(reactions) + missing = [r for r in rxn_ids if r not in source_model.reactions] + if missing: + raise ValueError(f"Reactions not found in the source model: {missing}") + + new_ids = [r for r in rxn_ids if r not in model.reactions] + if not new_ids: + raise ValueError("All reactions are already in the model.") + source_rxns = [source_model.reactions.get_by_id(r) for r in new_ids] + + if genes is False: + rules = [""] * len(source_rxns) + elif genes is True: + rules = [r.gene_reaction_rule for r in source_rxns] + elif isinstance(genes, str): + rules = [genes] * len(source_rxns) + else: + rules = list(genes) + if len(rules) != len(source_rxns): + raise ValueError( + f"genes list has {len(rules)} rules but {len(source_rxns)} " + "reactions are being added." + ) + + # Match metabolites by name[comp]; create only the genuinely new ones. + draft_by_name = {_name_comp(m): m for m in model.metabolites} + new_mets: list[Metabolite] = [] + pending: set[str] = set() + # Track ids minted within this batch so two source mets that share an id + # but differ in name[comp] don't collide when add_metabolites runs. + pending_ids: set[str] = set() + for srx in source_rxns: + for met in srx.metabolites: + key = _name_comp(met) + if key in draft_by_name or key in pending: + continue + pending.add(key) + if met.id not in model.metabolites and met.id not in pending_ids: + new_id = met.id + else: + # _new_met_id only knows the model; loop past in-batch hits too. + new_id = _new_met_id(model, "m") + while new_id in pending_ids: + n = int(new_id[1:]) + 1 + new_id = f"m{n}" + while new_id in model.metabolites: + n += 1 + new_id = f"m{n}" + pending_ids.add(new_id) + new_met = Metabolite( + new_id, + name=met.name, + compartment=met.compartment, + formula=met.formula, + charge=met.charge, + ) + new_met.annotation = copy.deepcopy(met.annotation) + new_met.notes = copy.deepcopy(met.notes) + new_mets.append(new_met) + draft_by_name[key] = new_met + if new_mets: + model.add_metabolites(new_mets) + + added: list[Reaction] = [] + for srx, rule in zip(source_rxns, rules, strict=True): + rxn = Reaction(srx.id, name=srx.name) + rxn.bounds = srx.bounds + rxn.subsystem = srx.subsystem + model.add_reactions([rxn]) + rxn.add_metabolites( + {draft_by_name[_name_comp(met)]: coef for met, coef in srx.metabolites.items()} + ) + if rule: + rxn.gene_reaction_rule = rule + rxn.annotation = copy.deepcopy(srx.annotation) + notes = copy.deepcopy(srx.notes) + if note is not None: + notes["note"] = note + if confidence is not None: + notes["confidence_score"] = confidence + rxn.notes = notes + added.append(rxn) + + return added diff --git a/src/raven_python/manipulation/transport.py b/src/raven_python/manipulation/transport.py new file mode 100644 index 0000000..d0c1bf1 --- /dev/null +++ b/src/raven_python/manipulation/transport.py @@ -0,0 +1,157 @@ +"""Add transport reactions between compartments. + +cobra has no transport-reaction primitive. For each metabolite this matches the +species by *name* across compartments (the source in ``from_compartment`` and its +same-named twin in each target compartment), optionally creating the target +metabolite, and +builds a ``-1 from / +1 to`` reaction with a sequential ``tr_0001`` ID. +""" +from __future__ import annotations + +import re +import warnings +from collections.abc import Iterable + +import cobra +from cobra import Metabolite, Reaction + +from raven_python.manipulation.add import _new_met_id + + +def _index_by_name(mets: Iterable[Metabolite], compartment: str) -> dict[str, Metabolite]: + """Index metabolites by name, warning when a name is duplicated. + + Same-name duplicates in a single compartment are unusual but legal in cobra, + and the previous one-pass dict comprehension silently dropped all but one. + """ + out: dict[str, list[Metabolite]] = {} + for m in mets: + out.setdefault(m.name, []).append(m) + chosen: dict[str, Metabolite] = {} + for name, group in out.items(): + if len(group) > 1: + warnings.warn( + f"Multiple metabolites named {name!r} in compartment {compartment!r} " + f"({[m.id for m in group]}); using {group[0].id!r} for transport.", + stacklevel=3, + ) + chosen[name] = group[0] + return chosen + + +def _transport_id_factory(model: cobra.Model, prefix: str): + pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$") + used = [int(m.group(1)) for r in model.reactions if (m := pattern.match(r.id))] + counter = max(used) + 1 if used else 1 + + def next_id() -> str: + nonlocal counter + while f"{prefix}{counter:04d}" in model.reactions: + counter += 1 + rid = f"{prefix}{counter:04d}" + counter += 1 + return rid + + return next_id + + +def add_transport_reactions( + model: cobra.Model, + from_compartment: str, + to_compartments: str | Iterable[str], + metabolite_names: str | Iterable[str] | None = None, + *, + reversible: bool = True, + only_to_existing: bool = True, + id_prefix: str = "tr_", +) -> list[Reaction]: + """Add transport reactions from one compartment to one or more others. + Parameters + ---------- + from_compartment + Source compartment id. + to_compartments + Target compartment id(s). + metabolite_names + Names of metabolites to transport. Default: every metabolite in + ``from_compartment``. + reversible + If True (default), bounds span the cobra configuration default + (reversible); otherwise lower bound 0. + only_to_existing + If True (default), only transport a metabolite into a target + compartment where a same-named metabolite already exists. If False, + create the missing target metabolite (copying name/formula/charge/ + annotation from the source) before adding the transport. + id_prefix + Prefix for the sequential reaction IDs (``tr_0001``, ...). + + Returns + ------- + list of cobra.Reaction + The transport reactions added, in creation order. + """ + # cobra's `model.compartments` only lists compartments that have metabolites; + # include registered-but-empty ones so transport can target an empty compartment. + known = set(model.compartments) | set(model._compartments) + if from_compartment not in known: + raise ValueError(f"Compartment {from_compartment!r} is not in the model.") + if isinstance(to_compartments, str): + to_compartments = [to_compartments] + else: + to_compartments = list(to_compartments) + for comp in to_compartments: + if comp not in known: + raise ValueError(f"Compartment {comp!r} is not in the model.") + + source = _index_by_name( + (m for m in model.metabolites if m.compartment == from_compartment), + from_compartment, + ) + if metabolite_names is None: + names = list(source) + else: + names = [metabolite_names] if isinstance(metabolite_names, str) else list(metabolite_names) + missing = [n for n in names if n not in source] + if missing: + raise ValueError( + f"Metabolites not found in compartment {from_compartment!r}: {missing}" + ) + + cfg = cobra.Configuration() + bounds = (cfg.lower_bound, cfg.upper_bound) if reversible else (0.0, cfg.upper_bound) + from_name = model.compartments.get(from_compartment) or from_compartment + next_id = _transport_id_factory(model, id_prefix) + + added: list[Reaction] = [] + for to_comp in to_compartments: + to_name = model.compartments.get(to_comp) or to_comp + targets = _index_by_name( + (m for m in model.metabolites if m.compartment == to_comp), + to_comp, + ) + for name in names: + src = source[name] + dst = targets.get(name) + if dst is None: + if only_to_existing: + continue + dst = Metabolite( + _new_met_id(model, "m"), + name=name, + compartment=to_comp, + formula=src.formula, + charge=src.charge, + ) + dst.annotation = dict(src.annotation) + model.add_metabolites([dst]) + targets[name] = dst + + rxn = Reaction(next_id()) + rxn.name = f"{name} transport, {from_name}-{to_name}" + rxn.bounds = bounds + model.add_reactions([rxn]) + rxn.add_metabolites({src: -1, dst: 1}) + added.append(rxn) + + return added diff --git a/tests/test_change_grrules.py b/tests/test_change_grrules.py new file mode 100644 index 0000000..d33f723 --- /dev/null +++ b/tests/test_change_grrules.py @@ -0,0 +1,49 @@ +"""Tests for change_gene_reaction_rules (changeGrRules port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, change_gene_reaction_rules + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a_c --> b_c", "gene_reaction_rule": "G1"}, + {"id": "R2", "equation": "a_c --> b_c"}, + ], + ) + return m + + +def test_replace_rule_and_create_genes(model): + (rxn,) = change_gene_reaction_rules(model, {"R1": "G2 and G3"}) + assert rxn.gene_reaction_rule == "G2 and G3" + assert {g.id for g in rxn.genes} == {"G2", "G3"} + assert {"G2", "G3"} <= {g.id for g in model.genes} + + +def test_append_rule(model): + change_gene_reaction_rules(model, {"R1": "G4"}, replace=False) + # (G1) or (G4), normalised by cobra + assert model.reactions.get_by_id("R1").gene_reaction_rule == "G1 or G4" + + +def test_append_when_empty_is_just_new(model): + change_gene_reaction_rules(model, {"R2": "G5"}, replace=False) + assert model.reactions.get_by_id("R2").gene_reaction_rule == "G5" + + +def test_batch(model): + changed = change_gene_reaction_rules(model, {"R1": "GA", "R2": "GB"}) + assert [r.id for r in changed] == ["R1", "R2"] + + +def test_unknown_reaction_errors(model): + with pytest.raises(ValueError, match="not found"): + change_gene_reaction_rules(model, {"NOPE": "G1"}) diff --git a/tests/test_manipulation_add.py b/tests/test_manipulation_add.py new file mode 100644 index 0000000..2a3a9d3 --- /dev/null +++ b/tests/test_manipulation_add.py @@ -0,0 +1,278 @@ +"""Tests for raven_python.manipulation.add (addRxns port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations +from raven_python.utils.parse import parse_name_comp + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", compartment="c"), + cobra.Metabolite("h2o_c", name="H2O", compartment="c"), + cobra.Metabolite("adp_c", name="ADP", compartment="c"), + cobra.Metabolite("pi_c", name="phosphate", compartment="c"), + ] + ) + return m + + +# --- parse_name_comp ------------------------------------------------------- + +@pytest.mark.parametrize( + "token,expected", + [ + ("ATP[c]", ("ATP", "c")), + ("ATP", ("ATP", None)), + (" ATP[c] ", ("ATP", "c")), + ("weird[name][m]", ("weird[name]", "m")), + ], +) +def test_parse_name_comp(token, expected): + assert parse_name_comp(token) == expected + + +# --- id mode (eqnType 1) --------------------------------------------------- + +def test_add_by_id_basic_and_reversibility(model): + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c + h2o_c <=> adp_c + pi_c"}] + ) + assert rxn.id == "R1" + assert rxn.reversibility is True + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "atp_c": -1.0, + "h2o_c": -1.0, + "adp_c": 1.0, + "pi_c": 1.0, + } + + +def test_irreversible_arrows(model): + rxns = add_reactions_from_equations( + model, + [ + {"id": "R1", "equation": "atp_c --> adp_c"}, + {"id": "R2", "equation": "atp_c => adp_c"}, + ], + ) + for r in rxns: + assert r.lower_bound == 0.0 + assert r.reversibility is False + + +def test_coefficients(model): + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "2 atp_c + 1.5 h2o_c --> adp_c"}] + ) + assert rxn.get_coefficient("atp_c") == -2.0 + assert rxn.get_coefficient("h2o_c") == -1.5 + + +def test_id_mode_creates_new_met_in_compartment(model): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="c", + ) + assert "amp_c" in model.metabolites + assert model.metabolites.get_by_id("amp_c").compartment == "c" + + +def test_id_mode_new_met_without_compartment_errors(model): + with pytest.raises(ValueError, match="no compartment"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> amp_c"}]) + + +# --- name mode (eqnType 2) ------------------------------------------------- + +def test_name_mode_matches_existing_by_name(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP + H2O <=> ADP + phosphate"}], + mets_by="name", + compartment="c", + ) + # resolved to the existing _c metabolites, not new ones + assert {m.id for m in rxn.metabolites} == {"atp_c", "h2o_c", "adp_c", "pi_c"} + assert len(model.metabolites) == 4 + + +def test_name_mode_creates_new_met_with_auto_id(model): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP --> AMP"}], + mets_by="name", + compartment="c", + ) + new = [m for m in model.metabolites if m.name == "AMP"] + assert len(new) == 1 + assert new[0].id == "m1" + assert new[0].compartment == "c" + + +def test_name_mode_requires_compartment(model): + with pytest.raises(ValueError, match="needs a compartment"): + add_reactions_from_equations( + model, [{"id": "R1", "equation": "ATP --> ADP"}], mets_by="name" + ) + + +# --- name[comp] mode (eqnType 3) ------------------------------------------- + +def test_name_comp_syntax(model): + model.add_metabolites([cobra.Metabolite("atp_m", name="ATP", compartment="m")]) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP[c] --> ATP[m]"}], + mets_by="name", + compartment="c", + ) + # matched ATP in two different compartments by name[comp] + assert {m.id for m in rxn.metabolites} == {"atp_c", "atp_m"} + + +# --- genes ----------------------------------------------------------------- + +def test_gene_rule_auto_creates_genes(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}], + ) + assert {g.id for g in rxn.genes} == {"G1", "G2"} + assert {g.id for g in model.genes} == {"G1", "G2"} + + +def test_strict_genes_errors_on_unknown(model): + with pytest.raises(ValueError, match="genes not in the model"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}], + allow_new_genes=False, + ) + + +def test_strict_genes_ok_when_present(model): + model.genes.append(cobra.core.gene.Gene("G1")) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1"}], + allow_new_genes=False, + ) + assert rxn.gene_reaction_rule == "G1" + + +# --- guards & extras ------------------------------------------------------- + +def test_duplicate_reaction_id_errors(model): + model.add_reactions([cobra.Reaction("R1")]) + with pytest.raises(ValueError, match="already exists"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c --> adp_c"}]) + + +def test_strict_mets_errors(model): + with pytest.raises(ValueError, match="allow_new_mets"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="c", + allow_new_mets=False, + ) + + +def test_explicit_bounds_override_arrow(model): + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c <=> adp_c", "bounds": (0, 50), "name": "myrxn"}], + ) + assert rxn.bounds == (0, 50) + assert rxn.name == "myrxn" + + +def test_net_zero_metabolite_dropped(model): + # atp_c on both sides nets to zero and is removed. + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c + h2o_c --> atp_c + adp_c"}] + ) + assert "atp_c" not in {m.id for m in rxn.metabolites} + assert {m.id for m in rxn.metabolites} == {"h2o_c", "adp_c"} + + +def test_missing_equation_errors(model): + with pytest.raises(ValueError, match="missing required 'equation'"): + add_reactions_from_equations(model, [{"id": "R1"}]) + + +def test_no_arrow_errors(model): + with pytest.raises(ValueError, match="No reaction arrow"): + add_reactions_from_equations(model, [{"id": "R1", "equation": "atp_c + h2o_c"}]) + + +# --- regression: leading-number metabolite name (known_issues.md A1) ------- + +def test_name_mode_preserves_leading_number_name(model): + """A metabolite name that begins with a number isn't misparsed as a coefficient. + + Before the fix the token ``"2 oxoglutarate"`` was parsed as ``(coeff=2, name="oxoglutarate")`` + silently — corrupting the stoichiometry. The resolver now prefers the full + token when it matches an existing metabolite name. + """ + model.add_metabolites([ + cobra.Metabolite("akg_c", name="2 oxoglutarate", compartment="c"), + ]) + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP + 2 oxoglutarate --> ADP"}], + mets_by="name", + compartment="c", + ) + assert rxn.get_coefficient("akg_c") == -1.0 # not -2.0 + assert rxn.get_coefficient("atp_c") == -1.0 + + +def test_name_mode_coefficient_still_works_without_collision(model): + """If the full token doesn't match anything, fall back to coefficient split.""" + (rxn,) = add_reactions_from_equations( + model, + [{"id": "R1", "equation": "2 ATP + H2O --> ADP + phosphate"}], + mets_by="name", + compartment="c", + ) + assert rxn.get_coefficient("atp_c") == -2.0 + + +# --- regression: empty-stoichiometry warning (known_issues.md A2) ---------- + +def test_empty_stoichiometry_warns(model): + """All-terms-cancel reaction warns instead of silently shipping an empty rxn.""" + with pytest.warns(UserWarning, match="no net metabolites"): + (rxn,) = add_reactions_from_equations( + model, [{"id": "R1", "equation": "atp_c --> atp_c"}] + ) + assert len(rxn.metabolites) == 0 + + +# --- regression: unknown-compartment warning (known_issues.md B2) ---------- + +def test_id_mode_unknown_compartment_warns(model): + """A typo'd compartment used to silently produce a one-met ghost compartment + in id mode (the name/[comp] path used to validate, id mode never did).""" + with pytest.warns(UserWarning, match="unregistered compartment 'cyto'"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "atp_c --> amp_c"}], + compartment="cyto", # typo for 'c' + ) + + +def test_name_comp_unknown_compartment_warns(model): + """Same defensive check in the name[comp] path when allow_new_mets=True.""" + with pytest.warns(UserWarning, match="unregistered compartment 'mito'"): + add_reactions_from_equations( + model, + [{"id": "R1", "equation": "ATP[c] --> AMP[mito]"}], + mets_by="name", + ) diff --git a/tests/test_manipulation_change.py b/tests/test_manipulation_change.py new file mode 100644 index 0000000..8d54f58 --- /dev/null +++ b/tests/test_manipulation_change.py @@ -0,0 +1,93 @@ +"""Tests for raven_python.manipulation.change (changeRxns port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, change_reaction_equations + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("a_c", name="A", compartment="c"), + cobra.Metabolite("b_c", name="B", compartment="c"), + cobra.Metabolite("c_c", name="C", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + { + "id": "R1", + "equation": "a_c <=> b_c", + "name": "first", + "bounds": (-30, 70), + "gene_reaction_rule": "G1 or G2", + "subsystem": "sub", + }, + {"id": "R2", "equation": "a_c --> c_c"}, + ], + ) + return m + + +def test_changes_stoichiometry(model): + (rxn,) = change_reaction_equations(model, {"R1": "a_c --> 2 c_c"}) + assert rxn.id == "R1" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "a_c": -1.0, + "c_c": 2.0, + } + + +def test_preserves_other_fields(model): + before = model.reactions.get_by_id("R1") + name, bounds, subsystem = before.name, before.bounds, before.subsystem + genes = {g.id for g in before.genes} + + change_reaction_equations(model, {"R1": "a_c --> c_c"}) + + after = model.reactions.get_by_id("R1") + assert after.name == name + assert after.bounds == bounds # bounds untouched, per RAVEN + assert after.subsystem == subsystem + assert {g.id for g in after.genes} == genes + + +def test_preserves_reaction_order(model): + order_before = [r.id for r in model.reactions] + change_reaction_equations(model, {"R1": "b_c --> c_c"}) + assert [r.id for r in model.reactions] == order_before + + +def test_bounds_not_changed_by_arrow(model): + # R1 starts reversible (-30, 70); a --> arrow must NOT make it irreversible. + change_reaction_equations(model, {"R1": "a_c --> b_c"}) + assert model.reactions.get_by_id("R1").bounds == (-30, 70) + + +def test_name_mode(model): + (rxn,) = change_reaction_equations( + model, {"R2": "A --> C"}, mets_by="name", compartment="c" + ) + assert {m.id for m in rxn.metabolites} == {"a_c", "c_c"} + + +def test_can_introduce_new_met(model): + change_reaction_equations( + model, {"R2": "a_c --> d_c"}, compartment="c" + ) + assert "d_c" in model.metabolites + assert model.reactions.get_by_id("R2").get_coefficient("d_c") == 1.0 + + +def test_unknown_reaction_errors(model): + with pytest.raises(ValueError, match="not found"): + change_reaction_equations(model, {"NOPE": "a_c --> b_c"}) + + +def test_multiple_reactions(model): + changed = change_reaction_equations(model, {"R1": "a_c --> c_c", "R2": "b_c --> c_c"}) + assert [r.id for r in changed] == ["R1", "R2"] + assert model.reactions.get_by_id("R2").get_coefficient("b_c") == -1.0 diff --git a/tests/test_manipulation_compartments.py b/tests/test_manipulation_compartments.py new file mode 100644 index 0000000..4d3fb3b --- /dev/null +++ b/tests/test_manipulation_compartments.py @@ -0,0 +1,139 @@ +"""Tests for manipulation/compartments.py — merge_compartments + copy_to_compartment.""" +from __future__ import annotations + +import cobra +import pytest + +from raven_python.manipulation.compartments import copy_to_compartment, merge_compartments + + +def _two_compartment_model() -> cobra.Model: + """A_c → B_c, A_m → B_m, and a transport A_c ↔ A_m. Multi-compartment toy.""" + m = cobra.Model("toy") + A_c = cobra.Metabolite("A_c", name="A", compartment="c") + A_m = cobra.Metabolite("A_m", name="A", compartment="m") + B_c = cobra.Metabolite("B_c", name="B", compartment="c") + B_m = cobra.Metabolite("B_m", name="B", compartment="m") + m.add_metabolites([A_c, A_m, B_c, B_m]) + + def rxn(rid, lb, ub, mets, gpr=None): + r = cobra.Reaction(rid, lower_bound=lb, upper_bound=ub) + r.add_metabolites(mets) + if gpr: + r.gene_reaction_rule = gpr + return r + m.add_reactions([rxn("r_c", 0, 1000, {A_c: -1, B_c: 1}, "g1"), + rxn("r_m", 0, 1000, {A_m: -1, B_m: 1}, "g2"), + rxn("tr_A", -1000, 1000, {A_c: -1, A_m: 1})]) + return m + + +# ----------------------------------------------------------------- merge_compartments + +def test_merge_compartments_collapses_to_one(): + """A_c + A_m → A; B_c + B_m → B; transport A_c↔A_m self-cancels and is dropped.""" + m = _two_compartment_model() + merged, deleted, dupes = merge_compartments(m) + # Only the base ids survive. + assert {x.id for x in merged.metabolites} == {"A", "B"} + # The transport reaction collapsed (A → A) and was deleted. + assert "tr_A" in deleted + # r_c and r_m are now both A → B; one of them gets deduplicated. + surviving = {r.id for r in merged.reactions} + assert len(surviving & {"r_c", "r_m"}) == 1 + assert (set(dupes) | (surviving & {"r_c", "r_m"})) == {"r_c", "r_m"} + + +def test_merge_compartments_preserves_gpr_and_subsystem(): + m = _two_compartment_model() + m.reactions.r_c.subsystem = "carbo" + merged, _, _ = merge_compartments(m) + survivor = next(r for r in merged.reactions if r.id in {"r_c", "r_m"}) + # The survivor keeps its gene rule + subsystem (cobra may sometimes lose them + # through copy; we set them explicitly). + assert survivor.gene_reaction_rule in {"g1", "g2"} + if survivor.id == "r_c": + assert survivor.subsystem == "carbo" + + +def test_merge_compartments_keeps_single_met_reactions_when_asked(): + """drop_single_metabolite_reactions=False keeps the collapsed transport (now A → A, + which is empty stoichiometry after net-cancellation — still dropped, but the *one-met* + case is the more interesting one). Use a uniport pattern to exercise it.""" + m = cobra.Model("uniport") + A_c = cobra.Metabolite("A_c", name="A", compartment="c") + A_m = cobra.Metabolite("A_m", name="A", compartment="m") + H_c = cobra.Metabolite("H_c", name="H", compartment="c") + m.add_metabolites([A_c, A_m, H_c]) + # H+ symport: A_c + H_c → A_m. After merge: A + H → A → leaves H. + sym = cobra.Reaction("sym", lower_bound=0, upper_bound=1000) + sym.add_metabolites({A_c: -1, H_c: -1, A_m: 1}) + m.add_reactions([sym]) + merged_drop, deleted_drop, _ = merge_compartments(m, drop_single_metabolite_reactions=True) + assert "sym" in deleted_drop + merged_keep, deleted_keep, _ = merge_compartments(m, drop_single_metabolite_reactions=False) + # With keep, sym survives as a one-met reaction (consumes H). + assert "sym" not in deleted_keep + assert "sym" in {r.id for r in merged_keep.reactions} + + +def test_merge_compartments_deduplicate_off_keeps_both(): + m = _two_compartment_model() + merged, _, dupes = merge_compartments(m, deduplicate_reactions=False) + assert dupes == [] + assert {"r_c", "r_m"} <= {r.id for r in merged.reactions} + + +# ----------------------------------------------------------------- copy_to_compartment + +def test_copy_to_compartment_basic(): + """Copy r_c into 'p' (peroxisome): a new reaction r_c_p with metabolites in p.""" + m = _two_compartment_model() + out, new_rxns, new_mets = copy_to_compartment(m, ["r_c"], "p", + target_compartment_name="peroxisome") + assert "r_c_p" in [r.id for r in out.reactions] + new_r = out.reactions.r_c_p + assert {x.compartment for x in new_r.metabolites} == {"p"} + assert "A_p" in [x.id for x in out.metabolites] + assert "B_p" in [x.id for x in out.metabolites] + assert new_rxns == ["r_c_p"] + assert set(new_mets) == {"A_p", "B_p"} + # Original still there. + assert "r_c" in [r.id for r in out.reactions] + + +def test_copy_to_compartment_preserves_gpr_and_bounds(): + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p") + new_r = out.reactions.r_c_p + assert new_r.gene_reaction_rule == "g1" + assert new_r.lower_bound == 0 and new_r.upper_bound == 1000 + + +def test_copy_to_compartment_delete_original_is_a_move(): + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p", delete_original=True) + assert "r_c" not in [r.id for r in out.reactions] + assert "r_c_p" in [r.id for r in out.reactions] + + +def test_copy_to_compartment_idempotent(): + """Calling twice doesn't add the reaction twice.""" + m = _two_compartment_model() + out, _, _ = copy_to_compartment(m, ["r_c"], "p") + out2, new_rxns, _ = copy_to_compartment(out, ["r_c"], "p") + assert new_rxns == [] # nothing added on second call + assert len([r for r in out2.reactions if r.id == "r_c_p"]) == 1 + + +def test_copy_to_compartment_unknown_reaction_raises(): + m = _two_compartment_model() + with pytest.raises(ValueError, match="not in model"): + copy_to_compartment(m, ["does_not_exist"], "p") + + +def test_copy_to_compartment_custom_suffix(): + m = _two_compartment_model() + out, new_rxns, _ = copy_to_compartment(m, ["r_c"], "p", id_suffix="copy1") + assert new_rxns == ["r_c_copy1"] + assert "A_copy1" in [x.id for x in out.metabolites] diff --git a/tests/test_manipulation_expand.py b/tests/test_manipulation_expand.py new file mode 100644 index 0000000..08cd2f2 --- /dev/null +++ b/tests/test_manipulation_expand.py @@ -0,0 +1,288 @@ +"""Tests for expand_model (RAVEN expandModel.m) — splitting isozymes into reactions. + +Adopted from geckopy's tests/test_expand.py. +""" +import cobra + +from raven_python.manipulation import expand_model +from raven_python.manipulation.expand import _gpr_to_dnf + +# --------------------------------------------------------------------------- # +# DNF conversion (internal helper, worth testing directly) +# --------------------------------------------------------------------------- # + +def _dnf_from_gpr_string(gpr_str: str) -> list[list[str]]: + from cobra.core.gene import GPR + + gpr = GPR.from_string(gpr_str) + return _gpr_to_dnf(gpr) + + +def test_dnf_empty_gpr(): + assert _dnf_from_gpr_string("") == [] + + +def test_dnf_single_gene(): + assert _dnf_from_gpr_string("g1") == [["g1"]] + + +def test_dnf_simple_and(): + assert _dnf_from_gpr_string("g1 and g2") == [["g1", "g2"]] + + +def test_dnf_simple_or(): + assert _dnf_from_gpr_string("g1 or g2") == [["g1"], ["g2"]] + + +def test_dnf_or_of_ands(): + assert _dnf_from_gpr_string("(g1 and g2) or (g3 and g4)") == [ + ["g1", "g2"], + ["g3", "g4"], + ] + + +def test_dnf_distributes_and_over_or(): + result = _dnf_from_gpr_string("g1 and (g2 or g3)") + assert result == [["g1", "g2"], ["g1", "g3"]] + + +def test_dnf_triple_or(): + assert _dnf_from_gpr_string("g1 or g2 or g3") == [ + ["g1"], ["g2"], ["g3"], + ] + + +def test_dnf_preserves_gene_order_within_clause(): + result = _dnf_from_gpr_string("g3 and g1 and g2") + assert result == [["g3", "g1", "g2"]] + + +# --------------------------------------------------------------------------- # +# expand_model +# --------------------------------------------------------------------------- # + +def _build_model( + reactions: list[tuple[str, dict[str, float], float, float, str]], +) -> cobra.Model: + """Build from (rxn_id, {met_id: coef}, lb, ub, gpr) tuples.""" + model = cobra.Model("test") + mets: dict[str, cobra.Metabolite] = {} + for _, stoich, _, _, _ in reactions: + for met_id in stoich: + if met_id not in mets: + mets[met_id] = cobra.Metabolite(met_id, compartment="c") + + for rxn_id, stoich, lb, ub, gpr in reactions: + rxn = cobra.Reaction(rxn_id) + rxn.lower_bound = lb + rxn.upper_bound = ub + rxn.add_metabolites({mets[m]: c for m, c in stoich.items()}) + if gpr: + rxn.gene_reaction_rule = gpr + model.add_reactions([rxn]) + return model + + +def test_does_not_expand_reaction_without_gpr(): + model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "")]) + added = expand_model(model) + assert added == [] + assert "r1" in {r.id for r in model.reactions} + + +def test_does_not_expand_single_and_clause(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 and g2"), + ]) + added = expand_model(model) + assert added == [] + r1 = model.reactions.get_by_id("r1") + assert r1.gene_reaction_rule == "g1 and g2" + + +def test_does_not_expand_single_gene(): + model = _build_model([("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1")]) + added = expand_model(model) + assert added == [] + assert model.reactions.get_by_id("r1").gene_reaction_rule == "g1" + + +def test_splits_simple_or_into_two_reactions(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + rxn_ids = {r.id for r in model.reactions} + assert "r1" not in rxn_ids + assert "r1_EXP_1" in rxn_ids + assert "r1_EXP_2" in rxn_ids + + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g2" + + +def test_splits_or_of_ands(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "(g1 and g2) or (g3 and g4)"), + ]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g3 and g4" + + +def test_distributes_and_over_or(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "g1 and (g2 or g3)"), + ]) + added = expand_model(model) + + assert added == ["r1_EXP_1", "r1_EXP_2"] + assert model.reactions.get_by_id("r1_EXP_1").gene_reaction_rule == "g1 and g2" + assert model.reactions.get_by_id("r1_EXP_2").gene_reaction_rule == "g1 and g3" + + +def test_expanded_reactions_inherit_stoichiometry_and_bounds(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 2.0}, -500.0, 1500.0, "g1 or g2"), + ]) + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.bounds == (-500.0, 1500.0) + stoich = {m.id: c for m, c in rxn.metabolites.items()} + assert stoich == {"A": -1.0, "B": 2.0} + + +def test_expanded_reactions_inherit_name_and_subsystem(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + r1 = model.reactions.get_by_id("r1") + r1.name = "an isozyme-catalyzed reaction" + r1.subsystem = "central metabolism" + + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.name == "an isozyme-catalyzed reaction" + assert rxn.subsystem == "central metabolism" + + +def test_multiple_reactions_expand_independently(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"), + ("r3", {"C": -1.0, "D": 1.0}, 0.0, 1000.0, + "(g5 and g6) or g7 or (g8 and g9)"), + ]) + added = expand_model(model) + + assert added == sorted([ + "r1_EXP_1", "r1_EXP_2", + "r3_EXP_1", "r3_EXP_2", "r3_EXP_3", + ]) + + rxn_ids = {r.id for r in model.reactions} + assert "r2" in rxn_ids + assert "r1" not in rxn_ids + assert "r3" not in rxn_ids + + assert model.reactions.get_by_id("r2").gene_reaction_rule == "g3 and g4" + assert model.reactions.get_by_id("r3_EXP_1").gene_reaction_rule == "g5 and g6" + assert model.reactions.get_by_id("r3_EXP_2").gene_reaction_rule == "g7" + assert model.reactions.get_by_id("r3_EXP_3").gene_reaction_rule == "g8 and g9" + + +def test_expanded_reaction_has_correct_gene_set(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, + "(g1 and g2) or (g3 and g4)"), + ]) + expand_model(model) + + r1_1 = model.reactions.get_by_id("r1_EXP_1") + assert {g.id for g in r1_1.genes} == {"g1", "g2"} + + r1_2 = model.reactions.get_by_id("r1_EXP_2") + assert {g.id for g in r1_2.genes} == {"g3", "g4"} + + +def test_expansion_is_idempotent_in_the_no_op_sense(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ("r2", {"B": -1.0, "C": 1.0}, 0.0, 1000.0, "g3 and g4"), + ]) + expand_model(model) + ids_before = {r.id for r in model.reactions} + + second = expand_model(model) + assert second == [] + + ids_after = {r.id for r in model.reactions} + assert ids_after == ids_before + + +def test_empty_model_is_unchanged(): + model = cobra.Model("empty") + assert expand_model(model) == [] + + +# --------------------------------------------------------------------------- # +# Annotation and notes propagation +# --------------------------------------------------------------------------- # + +def test_expanded_reactions_inherit_annotation_and_notes(): + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + r1 = model.reactions.get_by_id("r1") + r1.annotation["ec-code"] = "1.2.3.4" + r1.annotation["sbo"] = "SBO:0000176" + r1.notes["custom"] = "hello" + + expand_model(model) + + for suffix in ("_EXP_1", "_EXP_2"): + rxn = model.reactions.get_by_id(f"r1{suffix}") + assert rxn.annotation["ec-code"] == "1.2.3.4" + assert rxn.annotation["sbo"] == "SBO:0000176" + assert rxn.notes["custom"] == "hello" + + +def test_expanded_reaction_annotation_is_independent_of_parent(): + """Mutating one expanded reaction's annotation must not affect siblings.""" + model = _build_model([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0, "g1 or g2"), + ]) + model.reactions.get_by_id("r1").annotation["ec-code"] = ["1.2.3.4"] + + expand_model(model) + + r1_1 = model.reactions.get_by_id("r1_EXP_1") + r1_2 = model.reactions.get_by_id("r1_EXP_2") + r1_1.annotation["ec-code"].append("9.9.9.9") + assert r1_2.annotation["ec-code"] == ["1.2.3.4"] + + +def test_objective_coefficient_preserved_on_expansion(): + """An expanded reaction's isozyme copies retain the original objective coefficient.""" + m = cobra.Model("o") + a, b = (cobra.Metabolite(x, compartment="c") for x in "ab") + m.add_metabolites([a, b]) + r = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r.add_metabolites({a: -1, b: 1}) + r.gene_reaction_rule = "g1 or g2" + m.add_reactions([r]) + m.objective = "r1" # objective on the soon-to-be-expanded reaction + + expand_model(m) + coeffs = {rx.id: rx.objective_coefficient for rx in m.reactions} + assert coeffs == {"r1_EXP_1": 1.0, "r1_EXP_2": 1.0} # objective survives on both copies diff --git a/tests/test_manipulation_irreversible.py b/tests/test_manipulation_irreversible.py new file mode 100644 index 0000000..e211fa3 --- /dev/null +++ b/tests/test_manipulation_irreversible.py @@ -0,0 +1,144 @@ +"""Tests for convert_to_irreversible (RAVEN convertToIrrev.m). + +Adopted from geckopy's tests/test_preprocess.py (the convert_to_irreversible subset). +Exchange reactions are excluded from the split, matching MATLAB behavior. +""" +import cobra + +from raven_python.manipulation import convert_to_irreversible + + +def _build_model_with_bounds( + reactions: list[tuple[str, dict[str, float], float, float]], +) -> cobra.Model: + """Build from (rxn_id, {met_id: coef}, lb, ub) tuples.""" + model = cobra.Model("test") + mets: dict[str, cobra.Metabolite] = {} + for _, stoich, _, _ in reactions: + for met_id in stoich: + if met_id not in mets: + mets[met_id] = cobra.Metabolite(met_id, compartment="c") + + for rxn_id, stoich, lb, ub in reactions: + rxn = cobra.Reaction(rxn_id) + rxn.lower_bound = lb + rxn.upper_bound = ub + rxn.add_metabolites({mets[m]: c for m, c in stoich.items()}) + model.add_reactions([rxn]) + return model + + +def test_splits_single_reversible_non_exchange(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + + added = convert_to_irreversible(model) + assert added == ["r1_REV"] + + fwd = model.reactions.get_by_id("r1") + rev = model.reactions.get_by_id("r1_REV") + + assert fwd.bounds == (0.0, 1000.0) + assert {m.id: c for m, c in fwd.metabolites.items()} == {"A": -1.0, "B": 1.0} + + assert rev.bounds == (0.0, 500.0) + assert {m.id: c for m, c in rev.metabolites.items()} == {"A": 1.0, "B": -1.0} + + +def test_does_not_split_forward_only_reaction(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0), + ]) + added = convert_to_irreversible(model) + assert added == [] + assert "r1_REV" not in {r.id for r in model.reactions} + + +def test_does_not_split_exchange_reaction_even_if_reversible(): + """Exchange reactions (one metabolite) are explicitly excluded from + the irreversibility step in MATLAB, regardless of bounds.""" + model = _build_model_with_bounds([ + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + added = convert_to_irreversible(model) + assert added == [] + ex = model.reactions.get_by_id("EX_A") + assert ex.bounds == (-1000.0, 1000.0) + + +def test_splits_multiple_mixed_reactions(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), # split + ("r2", {"B": -2.0, "C": 3.0}, 0.0, 1000.0), # forward only + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), # exchange + ("r3", {"C": -1.0, "D": 1.0}, -200.0, 200.0), # split + ]) + + added = convert_to_irreversible(model) + assert added == ["r1_REV", "r3_REV"] + + assert model.reactions.get_by_id("r1").bounds == (0.0, 1000.0) + assert model.reactions.get_by_id("r1_REV").bounds == (0.0, 500.0) + assert model.reactions.get_by_id("r2").bounds == (0.0, 1000.0) + assert model.reactions.get_by_id("EX_A").bounds == (-1000.0, 1000.0) + assert model.reactions.get_by_id("r3").bounds == (0.0, 200.0) + assert model.reactions.get_by_id("r3_REV").bounds == (0.0, 200.0) + + +def test_reverse_reaction_inherits_gpr(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + model.reactions.get_by_id("r1").gene_reaction_rule = "g1 and g2" + + convert_to_irreversible(model) + + rev = model.reactions.get_by_id("r1_REV") + assert rev.gene_reaction_rule == "g1 and g2" + assert {g.id for g in rev.genes} == {"g1", "g2"} + + +def test_forward_reaction_lb_is_clamped_to_zero(): + """After splitting, the original reaction should have lb = 0, + which is what MATLAB's convertToIrrev does.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + convert_to_irreversible(model) + assert model.reactions.get_by_id("r1").lower_bound == 0.0 + + +def test_no_reverse_reaction_has_negative_bound(): + """After conversion, no non-exchange reaction may carry negative flux.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ("r2", {"B": -1.0, "C": 1.0}, -1000.0, 0.0), # blocked reverse + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + convert_to_irreversible(model) + for rxn in model.reactions: + if rxn.boundary: + continue + assert rxn.lower_bound >= 0, f"{rxn.id} still has lb < 0" + + +def test_returns_empty_list_when_nothing_to_split(): + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, 0.0, 1000.0), + ("EX_A", {"A": -1.0}, -1000.0, 1000.0), + ]) + assert convert_to_irreversible(model) == [] + + +def test_conversion_is_idempotent_after_first_pass(): + """Running convert_to_irreversible twice should not create + `_REV_REV` reactions, because the first pass already clamped + all non-exchange lb to 0.""" + model = _build_model_with_bounds([ + ("r1", {"A": -1.0, "B": 1.0}, -500.0, 1000.0), + ]) + convert_to_irreversible(model) + second = convert_to_irreversible(model) + assert second == [] + assert "r1_REV_REV" not in {r.id for r in model.reactions} diff --git a/tests/test_manipulation_merge.py b/tests/test_manipulation_merge.py new file mode 100644 index 0000000..a430f6e --- /dev/null +++ b/tests/test_manipulation_merge.py @@ -0,0 +1,136 @@ +"""Tests for merge_models (mergeModels port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, merge_models + + +def _model(mid, mets, reactions): + m = cobra.Model(mid) + m.add_metabolites(mets) + add_reactions_from_equations(m, reactions) + return m + + +@pytest.fixture +def model_a(): + return _model( + "A", + [ + cobra.Metabolite("glc_c", name="Glucose", compartment="c"), + cobra.Metabolite("g6p_c", name="G6P", compartment="c"), + ], + [{"id": "HEX", "equation": "glc_c --> g6p_c", "gene_reaction_rule": "GA"}], + ) + + +@pytest.fixture +def model_b(): + # same Glucose[c] compound but a DIFFERENT id + return _model( + "B", + [ + cobra.Metabolite("glucose_c", name="Glucose", compartment="c"), + cobra.Metabolite("lac_c", name="Lactate", compartment="c"), + ], + [{"id": "LDH", "equation": "glucose_c --> lac_c", "gene_reaction_rule": "GB"}], + ) + + +def test_unifies_metabolites_by_name_comp(model_a, model_b): + merged = merge_models([model_a, model_b]) + glucoses = [m for m in merged.metabolites if m.name == "Glucose" and m.compartment == "c"] + assert len(glucoses) == 1 # glc_c and glucose_c unified + # both reactions reference the same merged Glucose object + hex_glc = [m for m in merged.reactions.get_by_id("HEX").metabolites if m.name == "Glucose"][0] + ldh_glc = [m for m in merged.reactions.get_by_id("LDH").metabolites if m.name == "Glucose"][0] + assert hex_glc is ldh_glc + + +def test_match_by_id_keeps_distinct(model_a, model_b): + merged = merge_models([model_a, model_b], match_by="id") + glucoses = [m for m in merged.metabolites if m.name == "Glucose"] + assert len(glucoses) == 2 # glc_c and glucose_c are distinct by id + + +def test_all_reactions_kept(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert {"HEX", "LDH"} <= {r.id for r in merged.reactions} + + +def test_reaction_id_collision_renamed(model_a): + # two models with the same reaction id but different chemistry + other = _model( + "B", + [cobra.Metabolite("glc_c", name="Glucose", compartment="c"), + cobra.Metabolite("x_c", name="X", compartment="c")], + [{"id": "HEX", "equation": "glc_c --> x_c"}], + ) + merged = merge_models([model_a, other]) + assert "HEX" in {r.id for r in merged.reactions} + assert "HEX_B" in {r.id for r in merged.reactions} # renamed with source id + + +def test_genes_merged(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert {"GA", "GB"} <= {g.id for g in merged.genes} + + +def test_provenance_recorded(model_a, model_b): + merged = merge_models([model_a, model_b]) + assert merged.reactions.get_by_id("HEX").notes["origin"] == "A" + assert merged.reactions.get_by_id("LDH").notes["origin"] == "B" + assert merged.genes.get_by_id("GA").notes["origin"] == "A" + + +def test_compartments_preserved(model_a): + model_a.compartments = {"c": "cytoplasm"} + merged = merge_models([model_a, model_a.copy()]) + assert merged.compartments.get("c") == "cytoplasm" + + +def test_single_model_returns_copy(model_a): + merged = merge_models([model_a]) + assert merged is not model_a + assert {r.id for r in merged.reactions} == {r.id for r in model_a.reactions} + + +def test_three_models(model_a, model_b): + c = _model("C", [cobra.Metabolite("co2_c", name="CO2", compartment="c")], + [{"id": "SINK", "equation": "co2_c -->"}]) + merged = merge_models([model_a, model_b, c]) + assert {"HEX", "LDH", "SINK"} <= {r.id for r in merged.reactions} + + +def test_bad_match_by(model_a, model_b): + with pytest.raises(ValueError, match="match_by"): + merge_models([model_a, model_b], match_by="oops") + + +# --- regression: formula/charge conflict (known_issues.md B1) -------------- + +def test_formula_conflict_warns(): + """Two models sharing a name[comp] but with different formulas warn instead + of silently keeping the first.""" + a = _model("A", + [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", compartment="c")], + [{"id": "EX_A", "equation": "g1 -->"}]) + b = _model("B", + [cobra.Metabolite("g2", name="Glucose", formula="C6H12O7", compartment="c")], + [{"id": "EX_B", "equation": "g2 -->"}]) + with pytest.warns(UserWarning, match="different formulas"): + merged = merge_models([a, b]) + # The merge still picks the first-seen — the test asserts the warning fired + # and the model survives. + assert "EX_A" in merged.reactions and "EX_B" in merged.reactions + + +def test_charge_conflict_warns(): + a = _model("A", + [cobra.Metabolite("g1", name="Glucose", formula="C6H12O6", charge=0, compartment="c")], + [{"id": "EX_A", "equation": "g1 -->"}]) + b = _model("B", + [cobra.Metabolite("g2", name="Glucose", formula="C6H12O6", charge=-1, compartment="c")], + [{"id": "EX_B", "equation": "g2 -->"}]) + with pytest.warns(UserWarning, match="different charges"): + merge_models([a, b]) diff --git a/tests/test_manipulation_remove.py b/tests/test_manipulation_remove.py new file mode 100644 index 0000000..2b659b9 --- /dev/null +++ b/tests/test_manipulation_remove.py @@ -0,0 +1,97 @@ +"""Tests for raven_python.manipulation.remove (removeMets/removeGenes ports).""" +import cobra +import pytest + +from raven_python.manipulation import ( + add_reactions_from_equations, + remove_genes, + remove_metabolites, +) + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", compartment="c"), + cobra.Metabolite("atp_m", name="ATP", compartment="m"), + cobra.Metabolite("adp_c", name="ADP", compartment="c"), + cobra.Metabolite("x_c", name="X", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "atp_c --> adp_c", "gene_reaction_rule": "G1 and G2"}, + {"id": "R2", "equation": "atp_c --> x_c", "gene_reaction_rule": "G3 or G4"}, + {"id": "R3", "equation": "atp_m --> adp_c"}, # no GPR (spontaneous) + ], + ) + return m + + +# --- remove_metabolites ---------------------------------------------------- + +def test_remove_metabolites_by_id(model): + remove_metabolites(model, ["x_c"]) + assert "x_c" not in model.metabolites + # reaction kept, just lost the metabolite + assert "R2" in model.reactions + + +def test_remove_metabolites_by_name_across_compartments(model): + # "ATP" exists in c and m; by_name removes both at once. + remove_metabolites(model, ["ATP"], by_name=True) + assert "atp_c" not in model.metabolites + assert "atp_m" not in model.metabolites + assert "adp_c" in model.metabolites + + +def test_remove_metabolites_destructive(model): + remove_metabolites(model, ["adp_c"], destructive=True) + # R1 and R3 both produced adp_c -> removed + assert "adp_c" not in model.metabolites + assert "R1" not in model.reactions and "R3" not in model.reactions + + +# --- remove_genes ---------------------------------------------------------- + +def test_remove_genes_remove_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="remove") + # R1 = "G1 and G2": removing G1 breaks the complex -> blocked -> removed + assert blocked == ["R1"] + assert "R1" not in model.reactions + assert "R2" in model.reactions # OR rule unaffected + + +def test_remove_genes_constrain_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="constrain") + assert blocked == ["R1"] + r1 = model.reactions.get_by_id("R1") + assert r1.bounds == (0, 0) # kept but constrained, per RAVEN default + assert r1.gene_reaction_rule == "" + + +def test_remove_genes_keep_mode(model): + blocked = remove_genes(model, ["G1"], blocked_reactions="keep") + assert blocked == ["R1"] + r1 = model.reactions.get_by_id("R1") + assert r1.gene_reaction_rule == "" + assert r1.bounds != (0, 0) # left untouched + + +def test_remove_genes_or_rule_not_blocked(model): + blocked = remove_genes(model, ["G3"], blocked_reactions="remove") + # R2 = "G3 or G4": removing G3 leaves G4 -> not blocked + assert blocked == [] + assert model.reactions.get_by_id("R2").gene_reaction_rule == "G4" + + +def test_remove_genes_absent_gene_is_noop(model): + assert remove_genes(model, ["NOPE"]) == [] + + +def test_remove_genes_bad_policy(model): + with pytest.raises(ValueError, match="blocked_reactions"): + remove_genes(model, ["G1"], blocked_reactions="explode") diff --git a/tests/test_manipulation_simplify.py b/tests/test_manipulation_simplify.py new file mode 100644 index 0000000..586a0c3 --- /dev/null +++ b/tests/test_manipulation_simplify.py @@ -0,0 +1,184 @@ +"""Tests for simplifyModel reduction modes.""" +import cobra +import pytest + +from raven_python.manipulation import ( + add_reactions_from_equations, + constrain_reversible_reactions, + group_linear_reactions, + remove_dead_end_reactions, + remove_duplicate_reactions, +) + +# --- remove_dead_end_reactions -------------------------------------------- + +def test_dead_end_removed(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "dead")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a"}, + {"id": "R1", "equation": "a --> b"}, + {"id": "R_out", "equation": "b --> "}, + {"id": "R_dead", "equation": "a --> dead"}, # 'dead' only produced + ], + ) + removed_rxns, removed_mets = remove_dead_end_reactions(m) + assert "R_dead" in removed_rxns + assert "dead" in removed_mets + # the productive path survives + assert {"R_in", "R1", "R_out"} <= {r.id for r in m.reactions} + + +def test_dead_end_respects_reserved(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "dead")]) + add_reactions_from_equations( + m, [{"id": "R_in", "equation": " --> a"}, {"id": "R_dead", "equation": "a --> dead"}] + ) + removed_rxns, _ = remove_dead_end_reactions(m, reserved=["R_dead"]) + assert "R_dead" not in removed_rxns + assert "R_dead" in {r.id for r in m.reactions} + + +# --- remove_duplicate_reactions ------------------------------------------- + +def test_duplicates_removed(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)}, + {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)}, # duplicate of R1 + {"id": "R3", "equation": "a --> b", "bounds": (0, 500)}, # different bounds + ], + ) + removed = remove_duplicate_reactions(m) + assert len(removed) == 1 # one of R1/R2 removed + assert {"R3"} <= {r.id for r in m.reactions} + assert sum(r.id in ("R1", "R2") for r in m.reactions) == 1 + + +def test_duplicates_keep_reserved(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "bounds": (0, 1000)}, + {"id": "R2", "equation": "a --> b", "bounds": (0, 1000)}, + ], + ) + remove_duplicate_reactions(m, reserved=["R1"]) + assert "R1" in {r.id for r in m.reactions} # reserved one kept + + +# --- constrain_reversible_reactions --------------------------------------- + +def test_forward_only_reversible_constrained(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a", "bounds": (0, 1000)}, + {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)}, # can only go fwd + {"id": "R_out", "equation": "b --> ", "bounds": (0, 1000)}, + ], + ) + changed = constrain_reversible_reactions(m) + assert "R1" in changed + assert m.reactions.get_by_id("R1").lower_bound == 0 # constrained to forward + + +def test_truly_reversible_unchanged(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b")]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " <=> a", "bounds": (-1000, 1000)}, + {"id": "R1", "equation": "a <=> b", "bounds": (-1000, 1000)}, + {"id": "R_out", "equation": "b <=> ", "bounds": (-1000, 1000)}, + ], + ) + changed = constrain_reversible_reactions(m) + assert "R1" not in changed # can go both ways + + +# --- group_linear_reactions ----------------------------------------------- + +def test_linear_chain_merged(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b"}, # b: single producer + {"id": "R2", "equation": "b --> c"}, # b: single consumer + ], + ) + n_before = len(m.reactions) + group_linear_reactions(m) + # b is eliminated; R1+R2 merged into one reaction a --> c + assert "b" not in m.metabolites + assert len(m.reactions) < n_before + + +def test_group_linear_discards_genes(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b", "gene_reaction_rule": "G1"}, + {"id": "R2", "equation": "b --> c", "gene_reaction_rule": "G2"}, + ], + ) + group_linear_reactions(m) + assert len(m.genes) == 0 + + +# --- regression: incremental merge collapses a long chain (known_issues.md D1) --- + +def test_group_linear_merges_long_chain_in_one_pass(): + """The incremental scan still flattens a 5-reaction linear chain — the + correctness property the original O(n²·m) restart-after-merge loop had.""" + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in "abcdef"]) + add_reactions_from_equations( + m, + [ + {"id": "R_in", "equation": " --> a"}, + {"id": "R1", "equation": "a --> b"}, + {"id": "R2", "equation": "b --> c"}, + {"id": "R3", "equation": "c --> d"}, + {"id": "R4", "equation": "d --> e"}, + {"id": "R5", "equation": "e --> f"}, + {"id": "R_out", "equation": "f --> "}, + ], + ) + group_linear_reactions(m) + # All the chain's internal metabolites are gone. + assert {x for x in m.metabolites if x.id in {"b", "c", "d", "e"}} == set() + + +# --- regression: NaN FVA on infeasible model (known_issues.md C1) ---------- + +def test_constrain_reversible_raises_on_infeasible(): + """An infeasible model produces NaN FVA ranges; the old abs(NaN) < eps + check silently treated those as 'truly reversible'. Now raises.""" + m = cobra.Model("t") + a, b = (cobra.Metabolite(x, compartment="c") for x in ("a", "b")) + m.add_metabolites([a, b]) + # Force a contradiction: r requires production AND consumption of a, but + # nothing else produces a. + r = cobra.Reaction("r", lower_bound=-1, upper_bound=1) + r.add_metabolites({a: -1, b: 1}) + forced = cobra.Reaction("forced", lower_bound=5, upper_bound=10) # infeasible + forced.add_metabolites({a: -1}) + m.add_reactions([r, forced]) + with pytest.raises(RuntimeError, match="infeasible"): + constrain_reversible_reactions(m) diff --git a/tests/test_manipulation_transfer.py b/tests/test_manipulation_transfer.py new file mode 100644 index 0000000..61c2ac9 --- /dev/null +++ b/tests/test_manipulation_transfer.py @@ -0,0 +1,137 @@ +"""Tests for add_reactions_from_model (addRxnsGenesMets port).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, add_reactions_from_model + + +@pytest.fixture +def draft(): + m = cobra.Model("draft") + m.add_metabolites( + [cobra.Metabolite("glc_c", name="Glucose", formula="C6H12O6", compartment="c")] + ) + # an existing reaction so glc_c is in use and we have an id to test skipping + add_reactions_from_equations(m, [{"id": "R_existing", "equation": "glc_c <=>"}]) + return m + + +@pytest.fixture +def source(): + m = cobra.Model("source") + m.add_metabolites( + [ + # same name[comp] as draft's glc_c but a DIFFERENT id + cobra.Metabolite("glucose_c", name="Glucose", formula="C6H12O6", compartment="c"), + cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"), + cobra.Metabolite("g6p_c", name="G6P", formula="C6H13O9P", compartment="c"), + ] + ) + add_reactions_from_equations( + m, + [ + { + "id": "HEX", + "equation": "glucose_c + atp_c --> g6p_c", + "name": "hexokinase", + "bounds": (0, 1000), + "gene_reaction_rule": "G1", + "subsystem": "glycolysis", + }, + {"id": "R_existing", "equation": "glucose_c <=>"}, # id already in draft + ], + ) + return m + + +def test_metabolite_matched_by_name_comp_not_id(draft, source): + add_reactions_from_model(draft, source, "HEX") + hex_rxn = draft.reactions.get_by_id("HEX") + # Glucose reused from the draft (id glc_c), NOT the source's glucose_c + assert "glc_c" in {m.id for m in hex_rxn.metabolites} + assert "glucose_c" not in draft.metabolites + + +def test_new_metabolites_added_with_metadata(draft, source): + add_reactions_from_model(draft, source, "HEX") + assert "atp_c" in draft.metabolites and "g6p_c" in draft.metabolites + assert draft.metabolites.get_by_id("g6p_c").formula == "C6H13O9P" + assert draft.metabolites.get_by_id("atp_c").charge == -4 + + +def test_reaction_copied_with_bounds_and_name(draft, source): + (rxn,) = add_reactions_from_model(draft, source, "HEX") + assert rxn.id == "HEX" + assert rxn.name == "hexokinase" + assert rxn.bounds == (0, 1000) + assert rxn.subsystem == "glycolysis" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "glc_c": -1.0, + "atp_c": -1.0, + "g6p_c": 1.0, + } + + +def test_genes_true_copies_gpr_and_creates_genes(draft, source): + add_reactions_from_model(draft, source, "HEX", genes=True) + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G1" + assert "G1" in draft.genes + + +def test_genes_false_no_gpr(draft, source): + add_reactions_from_model(draft, source, "HEX", genes=False) + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "" + + +def test_genes_string_override(draft, source): + add_reactions_from_model(draft, source, "HEX", genes="G9 or G10") + assert draft.reactions.get_by_id("HEX").gene_reaction_rule == "G9 or G10" + + +def test_skips_already_present(draft, source): + added = add_reactions_from_model(draft, source, ["HEX", "R_existing"]) + assert [r.id for r in added] == ["HEX"] + + +def test_all_present_raises(draft, source): + with pytest.raises(ValueError, match="already in the model"): + add_reactions_from_model(draft, source, "R_existing") + + +def test_unknown_source_reaction_raises(draft, source): + with pytest.raises(ValueError, match="not found in the source model"): + add_reactions_from_model(draft, source, "NOPE") + + +def test_note_and_confidence_stored(draft, source): + (rxn,) = add_reactions_from_model(draft, source, "HEX", note="from KEGG", confidence=2) + assert rxn.notes["note"] == "from KEGG" + assert rxn.notes["confidence_score"] == 2 + + +# --- regression: intra-batch met-id minting collision (known_issues.md A3) --- + +def test_intra_batch_id_minting_unique(): + """Two source mets whose ids both collide with the draft and whose name[comp] + differs both get routed through new-id minting. The fix tracks ids minted in + the current batch so the two don't collapse to the same generated id.""" + draft = cobra.Model("draft") + draft.add_metabolites([ + cobra.Metabolite("atp_c", name="ATP-draft", compartment="c"), + cobra.Metabolite("adp_c", name="ADP-draft", compartment="c"), + ]) + source = cobra.Model("source") + source.add_metabolites([ + cobra.Metabolite("atp_c", name="ATP-source", compartment="c"), + cobra.Metabolite("adp_c", name="ADP-source", compartment="c"), + ]) + rxn = cobra.Reaction("R1", lower_bound=0, upper_bound=1000) + source.add_reactions([rxn]) + rxn.add_metabolites({ + source.metabolites.get_by_id("atp_c"): -1, + source.metabolites.get_by_id("adp_c"): 1, + }) + add_reactions_from_model(draft, source, "R1") + # Both source mets minted distinct ids (m1 and m2) — not a collision. + new_ids = sorted(m.id for m in draft.metabolites if m.id not in ("atp_c", "adp_c")) + assert len(new_ids) == 2 and len(set(new_ids)) == 2 diff --git a/tests/test_manipulation_transport.py b/tests/test_manipulation_transport.py new file mode 100644 index 0000000..e8fb2b6 --- /dev/null +++ b/tests/test_manipulation_transport.py @@ -0,0 +1,98 @@ +"""Tests for add_transport_reactions (addTransport port).""" +import cobra +import pytest + +from raven_python.manipulation import add_transport_reactions + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.compartments = {"c": "cytoplasm", "m": "mitochondrion", "e": "extracellular"} + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"), + cobra.Metabolite("h2o_c", name="H2O", formula="H2O", compartment="c"), + cobra.Metabolite("atp_m", name="ATP", compartment="m"), # exists in m + ] + ) + return m + + +def test_basic_transport_to_existing(model): + added = add_transport_reactions(model, "c", "m", ["ATP"]) + assert len(added) == 1 + rxn = added[0] + assert rxn.id == "tr_0001" + assert rxn.name == "ATP transport, cytoplasm-mitochondrion" + assert {m.id: rxn.get_coefficient(m.id) for m in rxn.metabolites} == { + "atp_c": -1.0, + "atp_m": 1.0, + } + assert rxn.reversibility is True + + +def test_only_to_existing_skips_missing(model): + # H2O is not in m; with only_to_existing (default) it's skipped + added = add_transport_reactions(model, "c", "m", ["ATP", "H2O"]) + assert [r.id for r in added] == ["tr_0001"] # only ATP + + +def test_creates_missing_target_metabolite(model): + added = add_transport_reactions( + model, "c", "m", ["H2O"], only_to_existing=False + ) + assert len(added) == 1 + new = [mt for mt in model.metabolites if mt.name == "H2O" and mt.compartment == "m"] + assert len(new) == 1 + assert new[0].formula == "H2O" # copied from source + + +def test_copies_formula_and_charge(model): + add_transport_reactions(model, "c", "e", ["ATP"], only_to_existing=False) + new = [mt for mt in model.metabolites if mt.name == "ATP" and mt.compartment == "e"][0] + assert new.formula == "C10H16N5O13P3" + assert new.charge == -4 + + +def test_irreversible(model): + (rxn,) = add_transport_reactions(model, "c", "m", ["ATP"], reversible=False) + assert rxn.lower_bound == 0 + assert rxn.reversibility is False + + +def test_default_all_metabolites_in_from(model): + # default metabolite_names = all in c (ATP, H2O); to m, only_to_existing -> only ATP + added = add_transport_reactions(model, "c", "m") + assert [r.id for r in added] == ["tr_0001"] + + +def test_multiple_target_compartments_and_sequential_ids(model): + added = add_transport_reactions( + model, "c", ["m", "e"], ["ATP"], only_to_existing=False + ) + assert [r.id for r in added] == ["tr_0001", "tr_0002"] + + +def test_unknown_compartment_raises(model): + with pytest.raises(ValueError, match="not in the model"): + add_transport_reactions(model, "x", "m", ["ATP"]) + + +def test_unknown_metabolite_raises(model): + with pytest.raises(ValueError, match="not found in compartment"): + add_transport_reactions(model, "c", "m", ["NOPE"]) + + +# --- regression: duplicate name in compartment (known_issues.md A4) -------- + +def test_duplicate_name_in_source_compartment_warns(model): + """Two source mets sharing a name in the same compartment warn instead of + silently collapsing — previously one was dropped from the lookup dict.""" + model.add_metabolites([ + cobra.Metabolite("h2o2_c", name="H2O", compartment="c"), # duplicate name + ]) + with pytest.warns(UserWarning, match="Multiple metabolites named 'H2O'"): + added = add_transport_reactions(model, "c", "m", ["H2O"], only_to_existing=False) + # Transport still works (uses the first match) — the warning is the signal. + assert len(added) == 1 diff --git a/tests/test_parameters.py b/tests/test_parameters.py new file mode 100644 index 0000000..c0ab06c --- /dev/null +++ b/tests/test_parameters.py @@ -0,0 +1,60 @@ +"""Tests for set_variance_bounds (the var mode of setParam).""" +import cobra +import pytest + +from raven_python.manipulation import add_reactions_from_equations, set_variance_bounds + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites( + [cobra.Metabolite("a_c", compartment="c"), cobra.Metabolite("b_c", compartment="c")] + ) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a_c <=> b_c"}, + {"id": "R2", "equation": "a_c <=> b_c"}, + ], + ) + return m + + +def test_band_positive(model): + set_variance_bounds(model, "R1", 100, 5) # 97.5 .. 102.5 + lb, ub = model.reactions.get_by_id("R1").bounds + assert lb == pytest.approx(97.5) + assert ub == pytest.approx(102.5) + + +def test_band_negative_is_ordered(model): + set_variance_bounds(model, "R1", -100, 5) + lb, ub = model.reactions.get_by_id("R1").bounds + assert lb == pytest.approx(-102.5) + assert ub == pytest.approx(-97.5) + assert lb <= ub + + +def test_broadcast_scalar(model): + set_variance_bounds(model, ["R1", "R2"], 50, 10) + for rid in ("R1", "R2"): + lb, ub = model.reactions.get_by_id(rid).bounds + assert lb == pytest.approx(47.5) + assert ub == pytest.approx(52.5) + + +def test_per_reaction_values(model): + set_variance_bounds(model, ["R1", "R2"], [100, 200], 0) + assert model.reactions.get_by_id("R1").bounds == pytest.approx((100, 100)) + assert model.reactions.get_by_id("R2").bounds == pytest.approx((200, 200)) + + +def test_length_mismatch_raises(model): + with pytest.raises(ValueError, match="to match the reactions"): + set_variance_bounds(model, ["R1", "R2"], [1, 2, 3], 5) + + +def test_unknown_reaction_raises(model): + with pytest.raises(ValueError, match="not found"): + set_variance_bounds(model, "NOPE", 1, 5) From a1dc557ece5d83d38466e96d676eeec66f90b1bb Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:50:05 +0200 Subject: [PATCH 5/7] Add binary + data resolvers for external tools and published artefacts --- docs/maintaining_binaries.md | 236 +++++++++++++++++++++++++++++++ scripts/make_registry_snippet.py | 102 +++++++++++++ src/raven_python/binaries.py | 148 +++++++++++++++++++ src/raven_python/data.py | 135 ++++++++++++++++++ tests/test_binaries.py | 80 +++++++++++ tests/test_data.py | 89 ++++++++++++ tests/test_scripts_registry.py | 58 ++++++++ 7 files changed, 848 insertions(+) create mode 100644 docs/maintaining_binaries.md create mode 100644 scripts/make_registry_snippet.py create mode 100644 src/raven_python/binaries.py create mode 100644 src/raven_python/data.py create mode 100644 tests/test_binaries.py create mode 100644 tests/test_data.py create mode 100644 tests/test_scripts_registry.py diff --git a/docs/maintaining_binaries.md b/docs/maintaining_binaries.md new file mode 100644 index 0000000..df5b315 --- /dev/null +++ b/docs/maintaining_binaries.md @@ -0,0 +1,236 @@ +# Maintaining bundled binaries (BLAST+, DIAMOND, …) + +Audience: **raven-python maintainers / the GitHub repo owner.** This explains how +raven-python ships external command-line tools, how to update their versions, and how +to build **minimal-footprint** ZIPs to attach to a GitHub release. + +> End users never read this. They get a binary automatically via `ensure_binary`, +> or use their own (system/conda) install. This doc is only for whoever publishes +> the release assets. + +--- + +## 1. How binary provisioning works + +raven-python does **not** vendor binaries in the git repo or on PyPI. Instead: + +1. For each tool we publish **version-pinned ZIPs as GitHub release assets**. +2. A **registry** (`src/raven_python/binaries_registry.json`) maps each *bundle* to its + version, the executables it provides, and per-platform `{asset, sha256}`. +3. At run time `raven_python.binaries.ensure_binary("blastp")` resolves a tool in this + order — and only reaches the download as a last resort: + + ``` + explicit binary= arg → env var (RAVEN_PYTHON_BLASTP / RAVEN_PYTHON_DIAMOND / …) + → shutil.which on PATH (system / conda / apt / brew) + → ensure_binary: download the pinned ZIP → verify SHA256 → cache → return path + → actionable error (with conda / manual instructions) + ``` + +So a pre-installed binary always wins; the bundle is the zero-setup fallback. +Pinning the version makes reconstruction **reproducible**. + +A *bundle* can provide several executables from one download (e.g. the `blast` +bundle provides both `blastp` and `makeblastdb`), so they are fetched once. + +--- + +## 2. What raven-python actually needs — ship only these + +Distribute the **minimum** set of executables. Everything else (other suite +tools, docs, examples, changelogs) must be excluded. + +| Bundle | Executables to include | Everything else | +|---|---|---| +| `diamond` | `diamond` | — (it is a single static binary) | +| `blast` | `blastp`, `makeblastdb` | **drop** `blastn`, `tblastn`, `psiblast`, `rpsblast`, `blast_formatter`, `*_vdb`, the `doc/`, `ChangeLog`, `README`, ~30 other tools | + +(Confirmed against RAVEN `getBlast`/`getDiamond`: only `makeblastdb`+`blastp`, and +`diamond` for its `makedb`/`blastp` subcommands, are ever invoked.) + +For BLAST+ this is the big win: the full NCBI suite is ~hundreds of MB; two +binaries (stripped) are a small fraction. + +--- + +## 3. Asset & ZIP conventions + +**Asset filename:** `---.zip` + +- `` ∈ `linux`, `macos`, `windows` +- `` ∈ `x86_64`, `arm64` +- examples: `diamond-2.1.11-linux-x86_64.zip`, `blast-2.16.0-macos-arm64.zip` + +**ZIP layout — flat, executables at the root, plus the upstream licence:** + +``` +diamond-2.1.11-linux-x86_64.zip +├── diamond +└── LICENSE + +blast-2.16.0-linux-x86_64.zip +├── blastp +├── makeblastdb +└── LICENSE +``` + +No nested `bin/`, no extra files. `ensure_binary` extracts the ZIP into the cache +and expects the executable at the top level. + +--- + +## 4. Step-by-step: add or update a version + +Example: bump DIAMOND to a new version for Linux x86-64. Repeat per `(os, arch)`. + +1. **Download the official upstream build** (never rebuild from source unless you + must): + - DIAMOND → + (`diamond-linux64.tar.gz`, `diamond-macos.tar.gz`) + - BLAST+ → or a + pinned version dir (`ncbi-blast-+-x64-linux.tar.gz`, + `-x64-macosx.tar.gz`, `-aarch64-linux.tar.gz`, `-x64-win64.tar.gz`). + - Record the upstream URL **and** its published checksum for provenance. +2. **Extract only the needed executables** (see §2) to a clean staging dir. +3. **Strip debug symbols** to shrink (skip on Windows / signed macOS builds): + ```bash + strip diamond # or: strip blastp makeblastdb + ``` +4. **Smoke-test the stripped binaries in a clean shell** (no other tools on PATH): + ```bash + ./diamond --version + ./blastp -version && ./makeblastdb -version + ``` + If they fail for a missing shared library, add that `.so`/`.dylib` to the ZIP + (rare — NCBI/DIAMOND release builds are largely self-contained). +5. **Add the upstream licence file** as `LICENSE` (see §6). +6. **Zip with max compression, flat layout:** + ```bash + zip -9 -j diamond-2.1.11-linux-x86_64.zip diamond LICENSE + # -j junks paths so entries sit at the ZIP root + ``` +7. **Compute the SHA256:** + ```bash + sha256sum diamond-2.1.11-linux-x86_64.zip # shasum -a 256 on macOS + ``` +8. **Attach the ZIP to a raven-python GitHub release** (a release tagged for the binary + set, e.g. `binaries-2024.06`, keeps them independent of code releases). +9. **Update the registry** `src/raven_python/binaries_registry.json` — bump `version` + and set the per-platform `asset` + `sha256`: + ```json + { + "diamond": { + "version": "2.1.11", + "provides": ["diamond"], + "platforms": { + "linux-x86_64": { + "asset": "diamond-2.1.11-linux-x86_64.zip", + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/binaries-2024.06/diamond-2.1.11-linux-x86_64.zip", + "sha256": "" + } + } + }, + "blast": { + "version": "2.16.0", + "provides": ["blastp", "makeblastdb"], + "platforms": { "linux-x86_64": { "asset": "...", "url": "...", "sha256": "..." } } + } + } + ``` +10. **Commit the registry change**, run the homology tests, and (if you have the + binary) confirm `ensure_binary("diamond", version="2.1.11")` downloads, + verifies, and runs. + +--- + +## 5. Keeping the footprint minimal — checklist + +- ✅ Only the executables in §2 (for BLAST+, exactly `blastp` + `makeblastdb`). +- ✅ `strip` the binaries (often halves their size). +- ✅ `zip -9 -j` (max compression, flat — no `bin/`, no folders). +- ✅ Exactly one extra file: `LICENSE`. +- ❌ No docs, examples, `ChangeLog`, `README`, man pages, test data, or sibling tools. +- ❌ No `.dSYM`/debug bundles; no duplicate static `.a` libraries. +- ➕ Only add a shared library if step-4 testing proves it is required. + +--- + +## 6. Platform / architecture matrix & licensing + +**Coverage = what you build.** Start with `linux-x86_64` (CI default), then add +`macos-arm64`, `macos-x86_64`, `linux-arm64`, `windows-x86_64` as capacity allows. +For any `(os, arch)` **not** in the registry, `ensure_binary` raises an actionable +error pointing to conda (`conda install -c bioconda diamond blast`) or a manual +install — that is the documented fallback, not a failure to fix urgently. + +**Licensing (must comply when redistributing):** + +- **BLAST+** — produced by NCBI (US Government); **public domain**, free to + redistribute. Include NCBI's `LICENSE` for courtesy/provenance. +- **DIAMOND** — **GPLv3**. Redistribution is allowed; you **must** include the + GPLv3 licence text in the ZIP and keep the binary unmodified (or offer source). +- **HMMER** (future) — BSD-3-Clause; include its `LICENSE`. + +Always ship the upstream licence in the ZIP, and keep a `BINARIES_PROVENANCE.md` +(or a note in the release body) recording, per asset: upstream URL, upstream +version, upstream checksum, and the SHA256 you published. + +### Native OS support per tool + +raven-python invokes each tool through `subprocess.run([resolved_path, …])` — that +call is itself cross-platform, so the real constraint is whether a given tool has +a binary that runs natively on each OS. It varies: + +| Tool | Linux | macOS (incl. arm64) | Windows (native) | +|---|---|---|---| +| BLAST+ (`blastp`, `makeblastdb`) | ✅ | ✅ | ✅ (NCBI ships Windows builds) | +| DIAMOND | ✅ | ✅ | ⚠️ native build exists but Linux-first | +| HMMER (`hmmbuild`/`hmmpress`/`hmmsearch`/`hmmscan`) | ✅ | ✅ | ❌ no official native build | +| MAFFT | ✅ | ✅ | ⚠️ Windows package is a wrapper | +| CD-HIT | ✅ | ✅ | ❌ no Windows build exists | + +Implications: + +- **Linux / macOS** — everything works. `conda install -c bioconda hmmer mafft + cd-hit blast diamond`, or point the `RAVEN_PYTHON_*` env vars at your installs. +- **Native Windows** — the homology track (BLAST+/DIAMOND) works, but the **KEGG + HMM build (3b.3) and HMM query (3b.5) do not**: HMMER and CD-HIT have no Windows + binaries, and bioconda has no Windows packages for any of them. Bundling can't + fix this — there is no binary to bundle. +- **Windows users should run raven-python inside WSL2** (or a Linux container), where + every tool is native Linux. raven-python does **not** replicate RAVEN's + `getWSLpath`/`wsl …` path translation: it calls the resolved binary directly, so + mixing native-Windows Python with WSL binaries is unsupported — keep the whole + stack inside WSL2. +- The common end-user paths — homology reconstruction and the KEGG *species* model + (3b.4) — need no HMMER/MAFFT/CD-HIT, so they are fully cross-platform. + +--- + +## 7. Emitting the registry entry + +After building the per-platform ZIPs (named `---.zip`) +and uploading them to the release, generate the `_REGISTRY` entry — checksums and +URLs — with [`scripts/make_registry_snippet.py`](../scripts/README.md): + +```bash +python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \ + --provides blastp makeblastdb --dir zips \ + --base-url https://github.com/ORG/raven-python/releases/download/blast-2.16.0 +``` + +It prints the ready-to-paste `_REGISTRY["blast"]` block; its SHA256 helper is the +same one `ensure_binary` verifies with, so the checksums always match. (Producing +the minimal ZIPs themselves — download upstream, `strip`, `zip -9 -j`, add +`LICENSE` per §3–§6 — is still a manual/per-tool step.) + +--- + +## 8. Adding a new tool later (e.g. HMMER for KEGG reconstruction) + +1. Decide the **minimal executable set** (e.g. HMMER → `hmmsearch`, `hmmscan`, + maybe `hmmbuild`/`hmmpress`). +2. Add a bundle entry to the registry with `provides` listing those executables. +3. Build/attach ZIPs per §3–§4; include the tool's licence (§6). +4. The wrappers call `ensure_binary("hmmsearch", …)` with the same resolution + order — no new provisioning code needed. diff --git a/scripts/make_registry_snippet.py b/scripts/make_registry_snippet.py new file mode 100644 index 0000000..3efa49e --- /dev/null +++ b/scripts/make_registry_snippet.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +"""Emit ready-to-paste registry entries for published artefacts / binary ZIPs. + +Computes the SHA256 of each file and prints the Python/JSON entry to merge into +``raven_python.data._DATA_REGISTRY`` (data artefacts) or ``raven_python.binaries._REGISTRY`` +(binary bundles). Run once per release, after uploading the files to the release. + +Examples +-------- +Data artefacts (KEGG reference model + tables + HMM libraries) for one release:: + + python scripts/make_registry_snippet.py data \\ + --dataset kegg --version kegg116 --dir artefacts \\ + --base-url https://github.com/ORG/raven_python/releases/download/kegg-data-kegg116 + +Binary bundle (one ZIP per platform, named ``---.zip``):: + + python scripts/make_registry_snippet.py binary \\ + --bundle blast --version 2.16.0 --provides blastp makeblastdb --dir zips \\ + --base-url https://github.com/ORG/raven_python/releases/download/blast-2.16.0 + +The SHA256 helper is shared with the runtime resolvers (``raven_python.binaries``), so +published checksums always match what ``ensure_data`` / ``ensure_binary`` verify. +""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from raven_python.binaries import _sha256 + + +def _files_in(directory: Path) -> list[Path]: + """Regular, non-hidden files in ``directory``, sorted by name.""" + return sorted(p for p in directory.iterdir() if p.is_file() and not p.name.startswith(".")) + + +def data_entry(dataset: str, version: str, base_url: str, directory: Path) -> dict: + """Build the ``_DATA_REGISTRY[dataset]`` entry for every file in ``directory``.""" + base = base_url.rstrip("/") + files = { + p.name: {"url": f"{base}/{p.name}", "sha256": _sha256(p)} for p in _files_in(directory) + } + if not files: + raise SystemExit(f"No files found in {directory}") + return {"version": version, "files": files} + + +def binary_entry( + bundle: str, version: str, provides: list[str], base_url: str, directory: Path +) -> dict: + """Build the ``_REGISTRY[bundle]`` entry from ``---.zip``.""" + base = base_url.rstrip("/") + prefix = f"{bundle}-{version}-" + platforms = {} + for zip_path in directory.glob(f"{prefix}*.zip"): + platform = zip_path.name[len(prefix) : -len(".zip")] + platforms[platform] = {"url": f"{base}/{zip_path.name}", "sha256": _sha256(zip_path)} + if not platforms: + raise SystemExit(f"No {prefix}*.zip files found in {directory}") + return {"version": version, "provides": provides, "platforms": dict(sorted(platforms.items()))} + + +def render(key: str, entry: dict) -> str: + """Render ``{key: entry}`` as an indented JSON block (valid Python to paste).""" + return json.dumps({key: entry}, indent=4) + + +def main(argv: list[str] | None = None) -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + sub = parser.add_subparsers(dest="kind", required=True) + + d = sub.add_parser("data", help="data-artefact registry entry (raven_python.data)") + d.add_argument("--dataset", required=True, help="dataset key, e.g. 'kegg'") + d.add_argument("--version", required=True) + d.add_argument("--dir", required=True, type=Path, help="directory of uploaded artefacts") + d.add_argument("--base-url", required=True, help="release download URL prefix") + + b = sub.add_parser("binary", help="binary-bundle registry entry (raven_python.binaries)") + b.add_argument("--bundle", required=True, help="bundle key, e.g. 'blast'") + b.add_argument("--version", required=True) + b.add_argument("--provides", nargs="+", required=True, help="executables the bundle provides") + b.add_argument("--dir", required=True, type=Path, help="directory of uploaded ZIPs") + b.add_argument("--base-url", required=True, help="release download URL prefix") + + args = parser.parse_args(argv) + if args.kind == "data": + key, entry = args.dataset, data_entry(args.dataset, args.version, args.base_url, args.dir) + target = "raven_python/data.py _DATA_REGISTRY" + else: + key = args.bundle + entry = binary_entry(args.bundle, args.version, args.provides, args.base_url, args.dir) + target = "raven_python/binaries.py _REGISTRY" + + print(f"# Merge into {target}:", file=sys.stderr) + print(render(key, entry)) + + +if __name__ == "__main__": + main() diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py new file mode 100644 index 0000000..2d4b5a2 --- /dev/null +++ b/src/raven_python/binaries.py @@ -0,0 +1,148 @@ +"""Locate and provision external command-line binaries (BLAST+, DIAMOND, …). + +Shared across tools (not homology-specific). Resolution order for any executable: + + explicit path arg → env var (RAVEN_PYTHON_) → shutil.which (PATH) + → ensure_binary (download the version-pinned ZIP from a raven_python release, + verify SHA256, cache, return the path) + → FileNotFoundError with install guidance + +So a pre-installed/conda binary always wins; the bundled ZIP is the zero-setup +fallback. See docs/maintaining_binaries.md for how the release ZIPs and the +registry are produced and updated. +""" +from __future__ import annotations + +import hashlib +import os +import platform +import shutil +import zipfile +from pathlib import Path +from urllib.request import urlopen + +# Registry of bundled binaries. Empty until release ZIPs are published; populated +# per docs/maintaining_binaries.md. Keyed by *bundle*; one bundle can provide +# several executables (e.g. "blast" -> blastp + makeblastdb). +# bundle -> {version, provides:[exe...], platforms:{"-": {url, sha256}}} +_REGISTRY: dict = {} + +# Environment variable overrides per executable. +_ENV_VARS = { + "diamond": "RAVEN_PYTHON_DIAMOND", + "blastp": "RAVEN_PYTHON_BLASTP", + "makeblastdb": "RAVEN_PYTHON_MAKEBLASTDB", + "hmmbuild": "RAVEN_PYTHON_HMMBUILD", + "hmmpress": "RAVEN_PYTHON_HMMPRESS", + "hmmsearch": "RAVEN_PYTHON_HMMSEARCH", + "hmmscan": "RAVEN_PYTHON_HMMSCAN", + "mafft": "RAVEN_PYTHON_MAFFT", + "cd-hit": "RAVEN_PYTHON_CDHIT", +} + + +def platform_key() -> str: + """Return the ``-`` key used in the registry (e.g. ``linux-x86_64``).""" + system = {"linux": "linux", "darwin": "macos", "windows": "windows"}.get( + platform.system().lower(), platform.system().lower() + ) + machine = platform.machine().lower() + arch = {"x86_64": "x86_64", "amd64": "x86_64", "arm64": "arm64", "aarch64": "arm64"}.get( + machine, machine + ) + return f"{system}-{arch}" + + +def _cache_dir() -> Path: + base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache") + return Path(base) / "raven_python" / "binaries" + + +def _bundle_for(executable: str, registry: dict): + for name, bundle in registry.items(): + if executable in bundle.get("provides", []): + return name, bundle + return None, None + + +def _sha256(path: Path) -> str: + h = hashlib.sha256() + with open(path, "rb") as fh: + for chunk in iter(lambda: fh.read(1 << 20), b""): + h.update(chunk) + return h.hexdigest() + + +def ensure_binary(executable: str, *, registry: dict | None = None) -> Path: + """Download (if needed) and return the path to a bundled ``executable``. + + Consults the registry for the current platform, downloads the pinned ZIP, + verifies its SHA256, extracts it into the cache, and returns the executable + path. Raises ``FileNotFoundError`` if no bundle for this platform is hosted. + """ + registry = _REGISTRY if registry is None else registry + bundle_name, bundle = _bundle_for(executable, registry) + if bundle is None: + raise FileNotFoundError( + f"No bundled binary registered for {executable!r}. Install it (e.g. " + f"`conda install -c bioconda {executable}`) or pass an explicit path." + ) + key = platform_key() + entry = bundle.get("platforms", {}).get(key) + if entry is None: + raise FileNotFoundError( + f"No bundled {executable!r} for platform {key!r}. Install it " + f"(e.g. `conda install -c bioconda {executable}`), set " + f"{_ENV_VARS.get(executable, 'the binary path')}, or pass binary=." + ) + + dest_dir = _cache_dir() / f"{bundle_name}-{bundle['version']}-{key}" + exe = dest_dir / executable + if exe.exists(): + return exe + + dest_dir.mkdir(parents=True, exist_ok=True) + archive = dest_dir / "_download.zip" + # Download into a sibling .part file and rename on success — an interrupted + # download leaves the partial behind .part, never as a half-complete .zip + # that a later run might mistake for a finished one. Mirrors data.py. + part = archive.with_suffix(archive.suffix + ".part") + try: + with urlopen(entry["url"]) as resp, open(part, "wb") as out: # noqa: S310 + shutil.copyfileobj(resp, out) + digest = _sha256(part) + if digest != entry["sha256"]: + raise ValueError( + f"SHA256 mismatch for {executable!r} ({key}): " + f"expected {entry['sha256']}, got {digest}." + ) + os.replace(part, archive) + finally: + part.unlink(missing_ok=True) + with zipfile.ZipFile(archive) as zf: + zf.extractall(dest_dir) + archive.unlink(missing_ok=True) + if not exe.exists(): + raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.") + exe.chmod(0o755) + return exe + + +def resolve_binary(executable: str, *, binary: str | os.PathLike | None = None) -> str: + """Resolve an executable to a path: arg → env var → PATH → bundled ZIP → error.""" + if binary is not None: + return os.fspath(binary) + env_var = _ENV_VARS.get(executable) + if env_var and os.environ.get(env_var): + return os.environ[env_var] + found = shutil.which(executable) + if found: + return found + try: + return os.fspath(ensure_binary(executable)) + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Could not find {executable!r}. Install it (e.g. " + f"`conda install -c bioconda {executable}`), put it on PATH, set " + f"{env_var or 'the binary path'}, or pass binary=. ({exc})" + ) from exc diff --git a/src/raven_python/data.py b/src/raven_python/data.py new file mode 100644 index 0000000..b1264be --- /dev/null +++ b/src/raven_python/data.py @@ -0,0 +1,135 @@ +"""Fetch and cache published data artefacts (KEGG reference model, tables, HMMs). + +The mirror of :mod:`raven_python.binaries` for *data*: a version-pinned registry of +downloadable artefacts, fetched on first use, SHA256-verified, and cached under +platformdirs so end users never rebuild them from a KEGG dump (that is the +maintainer's job — see docs/maintaining_kegg_data.md). + +Resolution for any artefact file: + + explicit local dir → cached copy → download from the registry (verify, + cache) → FileNotFoundError with guidance + +The registry is **empty until the artefacts are published** (same as +``binaries._REGISTRY``); until then ``ensure_data_file`` raises an actionable +error. Cache layout:: + + $XDG_CACHE_HOME/raven_python/data/-/ + (or ~/.cache/raven_python/data/... if XDG_CACHE_HOME is unset) +""" +from __future__ import annotations + +import os +import shutil +from pathlib import Path +from urllib.request import urlopen + +from raven_python.binaries import _sha256 + +# dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}} +# Populated when raven_python publishes the KEGG artefacts as release assets. +_DATA_REGISTRY: dict = {} + +# The core KEGG artefacts needed to build a model (no HMM libraries). +CORE_KEGG_FILES = ( + "reference_model.yml.gz", + "ko_reaction.tsv.gz", + "ko_names.tsv.gz", + "organism_gene_ko.tsv.xz", + "rxn_flags.tsv.gz", +) + + +def _data_cache_dir() -> Path: + base = os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache") + return Path(base) / "raven_python" / "data" + + +def _bundle(dataset: str, registry: dict) -> dict: + bundle = registry.get(dataset) + if bundle is None: + raise FileNotFoundError( + f"No data artefacts registered for {dataset!r}. Either pass a local " + f"directory of artefacts, or build them per docs/maintaining_kegg_data.md." + ) + return bundle + + +def ensure_data_file( + dataset: str, + filename: str, + *, + version: str | None = None, + registry: dict | None = None, +) -> Path: + """Download (if needed) and return the cached path to one artefact file. + + Looks the file up in the registry for ``dataset`` (at ``version`` or the + registry's default), downloads it to the version-pinned cache directory, + verifies its SHA256, and returns the path. Re-uses an already-cached copy. + """ + registry = _DATA_REGISTRY if registry is None else registry + bundle = _bundle(dataset, registry) + ver = version or bundle["version"] + entry = bundle.get("files", {}).get(filename) + if entry is None: + raise FileNotFoundError( + f"{filename!r} is not registered for {dataset!r} {ver}. " + f"Available: {sorted(bundle.get('files', {}))}." + ) + + dest_dir = _data_cache_dir() / f"{dataset}-{ver}" + dest = dest_dir / filename + if dest.exists(): + return dest + + dest_dir.mkdir(parents=True, exist_ok=True) + tmp = dest.with_name(dest.name + ".part") + with urlopen(entry["url"]) as resp, open(tmp, "wb") as out: # noqa: S310 (trusted registry URLs) + shutil.copyfileobj(resp, out) + digest = _sha256(tmp) + if digest != entry["sha256"]: + tmp.unlink(missing_ok=True) + raise ValueError( + f"SHA256 mismatch for {dataset}/{filename} ({ver}): " + f"expected {entry['sha256']}, got {digest}." + ) + tmp.replace(dest) + return dest + + +def ensure_kegg_data( + *, + version: str | None = None, + files: tuple[str, ...] = CORE_KEGG_FILES, + registry: dict | None = None, +) -> Path: + """Ensure the core KEGG artefacts are cached; return their directory. + + Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg`` + dataset and returns the cache directory holding them — ready to pass as the + ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`. + """ + registry = _DATA_REGISTRY if registry is None else registry + ver = version or _bundle("kegg", registry)["version"] + for filename in files: + ensure_data_file("kegg", filename, version=ver, registry=registry) + return _data_cache_dir() / f"kegg-{ver}" + + +def ensure_kegg_hmm_library( + domain: str, *, version: str | None = None, registry: dict | None = None +) -> Path: + """Ensure a domain HMM library (and its hmmpress index) is cached; return its path. + + ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``.hmm`` + plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the + path to the ``.hmm`` (the argument for :func:`run_hmmscan`). + """ + registry = _DATA_REGISTRY if registry is None else registry + ver = version or _bundle("kegg", registry)["version"] + base = f"{domain}.hmm" + library = ensure_data_file("kegg", base, version=ver, registry=registry) + for suffix in (".h3f", ".h3i", ".h3m", ".h3p"): + ensure_data_file("kegg", base + suffix, version=ver, registry=registry) + return library diff --git a/tests/test_binaries.py b/tests/test_binaries.py new file mode 100644 index 0000000..d74ce0b --- /dev/null +++ b/tests/test_binaries.py @@ -0,0 +1,80 @@ +"""Tests for raven_python.binaries (binary resolution + bundled-ZIP provisioning).""" +import hashlib +import shutil +import zipfile +from pathlib import Path + +import pytest + +from raven_python import binaries + + +def test_resolve_explicit_path(): + assert binaries.resolve_binary("blastp", binary="/opt/x/blastp") == "/opt/x/blastp" + + +def test_resolve_env_var(monkeypatch): + monkeypatch.setenv("RAVEN_PYTHON_DIAMOND", "/custom/diamond") + assert binaries.resolve_binary("diamond") == "/custom/diamond" + + +@pytest.mark.skipif(not shutil.which("blastp"), reason="blastp not installed") +def test_resolve_via_path(): + assert binaries.resolve_binary("blastp") == shutil.which("blastp") + + +def test_resolve_unresolvable_raises(monkeypatch): + monkeypatch.setattr(shutil, "which", lambda _: None) + with pytest.raises(FileNotFoundError, match="Could not find"): + binaries.resolve_binary("diamond") # empty registry, not on PATH + + +def test_platform_key_format(): + key = binaries.platform_key() + assert "-" in key + os_part, arch = key.split("-", 1) + assert os_part in {"linux", "macos", "windows"} or os_part # tolerant + + +def test_ensure_binary_downloads_verifies_extracts(tmp_path, monkeypatch): + # Build a fake bundle ZIP containing an executable, served via file:// URL. + exe = tmp_path / "footool" + exe.write_text("#!/bin/sh\necho hi\n") + archive = tmp_path / "footool.zip" + with zipfile.ZipFile(archive, "w") as zf: + zf.write(exe, "footool") + sha = hashlib.sha256(archive.read_bytes()).hexdigest() + + registry = { + "footool": { + "version": "1.0", + "provides": ["footool"], + "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": sha}}, + } + } + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache")) + + path = binaries.ensure_binary("footool", registry=registry) + assert Path(path).exists() + assert Path(path).name == "footool" + # cached on second call (same path, no re-download needed) + assert binaries.ensure_binary("footool", registry=registry) == path + + +def test_ensure_binary_sha_mismatch(tmp_path, monkeypatch): + archive = tmp_path / "x.zip" + with zipfile.ZipFile(archive, "w") as zf: + zf.writestr("footool", "data") + registry = { + "footool": {"version": "1", "provides": ["footool"], + "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": "deadbeef"}}} + } + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache")) + with pytest.raises(ValueError, match="SHA256 mismatch"): + binaries.ensure_binary("footool", registry=registry) + + +def test_ensure_binary_unhosted_platform_raises(tmp_path): + registry = {"footool": {"version": "1", "provides": ["footool"], "platforms": {}}} + with pytest.raises(FileNotFoundError, match="No bundled"): + binaries.ensure_binary("footool", registry=registry) diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..714c3a9 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,89 @@ +"""Tests for ensure_data (data.py). Uses file:// URLs to avoid the network.""" +import hashlib + +import pytest + +from raven_python.data import ( + CORE_KEGG_FILES, + ensure_data_file, + ensure_kegg_data, +) + + +def _sha256(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +@pytest.fixture +def served(tmp_path, monkeypatch): + """A fake registry served from local files, with the cache pointed at tmp.""" + src = tmp_path / "src" + src.mkdir() + payloads = { + "reference_model.yml.gz": b"!!omap model bytes", + "ko_reaction.tsv.gz": b"ko\treaction\n", + "ko_names.tsv.gz": b"ko\tname\n", + "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n", + "rxn_flags.tsv.gz": b"reaction\tspontaneous\n", + } + files = {} + for name, data in payloads.items(): + path = src / name + path.write_bytes(data) + files[name] = {"url": path.as_uri(), "sha256": _sha256(data)} + registry = {"kegg": {"version": "v1", "files": files}} + + cache = tmp_path / "cache" + monkeypatch.setenv("XDG_CACHE_HOME", str(cache)) + return registry, cache, payloads + + +def test_ensure_data_file_downloads_and_caches(served): + registry, cache, payloads = served + path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry) + assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz" + assert path.read_bytes() == payloads["ko_reaction.tsv.gz"] + + +def test_ensure_data_file_reuses_cache(served, monkeypatch): + registry, _, _ = served + first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry) + # Break the URL: a second call must hit the cache, not re-download. + registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent" + second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry) + assert first == second and second.exists() + + +def test_sha256_mismatch_rejected(served): + registry, cache, _ = served + registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64 + with pytest.raises(ValueError, match="SHA256 mismatch"): + ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry) + # The corrupt partial download must not be left behind. + assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists() + + +def test_unknown_dataset_actionable_error(served): + registry, _, _ = served + with pytest.raises(FileNotFoundError, match="No data artefacts registered"): + ensure_data_file("metacyc", "x", registry=registry) + + +def test_unknown_file_lists_available(served): + registry, _, _ = served + with pytest.raises(FileNotFoundError, match="not registered"): + ensure_data_file("kegg", "missing.tsv.gz", registry=registry) + + +def test_ensure_kegg_data_fetches_core_set(served): + registry, cache, _ = served + out = ensure_kegg_data(registry=registry) + assert out == cache / "raven_python" / "data" / "kegg-v1" + for name in CORE_KEGG_FILES: + assert (out / name).is_file() + + +def test_empty_registry_raises(): + # The shipped registry is empty until artefacts are published. + with pytest.raises(FileNotFoundError, match="No data artefacts registered"): + ensure_data_file("kegg", "ko_reaction.tsv.gz") diff --git a/tests/test_scripts_registry.py b/tests/test_scripts_registry.py new file mode 100644 index 0000000..c9c03cf --- /dev/null +++ b/tests/test_scripts_registry.py @@ -0,0 +1,58 @@ +"""Tests for scripts/make_registry_snippet.py registry-entry helpers.""" +import hashlib +import importlib.util +import json +from pathlib import Path + +import pytest + +# scripts/ is not a package; load the module directly by path. +_SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "make_registry_snippet.py" +_spec = importlib.util.spec_from_file_location("make_registry_snippet", _SCRIPT) +mrs = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(mrs) + + +def _sha(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +def test_data_entry_lists_files_with_urls_and_checksums(tmp_path): + (tmp_path / "reference_model.yml.gz").write_bytes(b"model") + (tmp_path / "ko_reaction.tsv.gz").write_bytes(b"table") + (tmp_path / ".hidden").write_bytes(b"skip") # hidden files ignored + + entry = mrs.data_entry("kegg", "kegg116", "https://x/rel/", tmp_path) + assert entry["version"] == "kegg116" + assert set(entry["files"]) == {"reference_model.yml.gz", "ko_reaction.tsv.gz"} + ref = entry["files"]["reference_model.yml.gz"] + assert ref["url"] == "https://x/rel/reference_model.yml.gz" # trailing slash collapsed + assert ref["sha256"] == _sha(b"model") + + +def test_data_entry_empty_dir_errors(tmp_path): + with pytest.raises(SystemExit): + mrs.data_entry("kegg", "v1", "https://x", tmp_path) + + +def test_binary_entry_parses_platform_from_filename(tmp_path): + (tmp_path / "blast-2.16.0-linux-x86_64.zip").write_bytes(b"linux") + (tmp_path / "blast-2.16.0-macos-arm64.zip").write_bytes(b"mac") + (tmp_path / "other-1.0-linux-x86_64.zip").write_bytes(b"nope") # different bundle + + entry = mrs.binary_entry("blast", "2.16.0", ["blastp", "makeblastdb"], "https://x", tmp_path) + assert entry["provides"] == ["blastp", "makeblastdb"] + assert set(entry["platforms"]) == {"linux-x86_64", "macos-arm64"} + assert entry["platforms"]["macos-arm64"]["sha256"] == _sha(b"mac") + assert entry["platforms"]["linux-x86_64"]["url"].endswith("blast-2.16.0-linux-x86_64.zip") + + +def test_binary_entry_no_zips_errors(tmp_path): + with pytest.raises(SystemExit): + mrs.binary_entry("blast", "2.16.0", ["blastp"], "https://x", tmp_path) + + +def test_render_is_valid_json_round_trip(): + entry = {"version": "v1", "files": {"a": {"url": "u", "sha256": "s"}}} + text = mrs.render("kegg", entry) + assert json.loads(text) == {"kegg": entry} From 6ef3357aeee811cde778174aa98887327f1e62f7 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:50:36 +0200 Subject: [PATCH 6/7] Add YAML and SIF model I/O --- src/raven_python/io/__init__.py | 15 +++ src/raven_python/io/sif.py | 96 +++++++++++++++ src/raven_python/io/yaml.py | 191 ++++++++++++++++++++++++++++++ tests/test_io_sif.py | 82 +++++++++++++ tests/test_io_yaml.py | 202 ++++++++++++++++++++++++++++++++ 5 files changed, 586 insertions(+) create mode 100644 src/raven_python/io/__init__.py create mode 100644 src/raven_python/io/sif.py create mode 100644 src/raven_python/io/yaml.py create mode 100644 tests/test_io_sif.py create mode 100644 tests/test_io_yaml.py diff --git a/src/raven_python/io/__init__.py b/src/raven_python/io/__init__.py new file mode 100644 index 0000000..bc70511 --- /dev/null +++ b/src/raven_python/io/__init__.py @@ -0,0 +1,15 @@ +"""RAVEN-specific I/O: YAML (cobra + Metabolic Atlas / Human-GEM extensions), SIF, +Excel export, and the Standard-GEM ``model//…`` git layout. +""" +from raven_python.io.excel import export_to_excel +from raven_python.io.git import export_for_git +from raven_python.io.sif import export_model_to_sif +from raven_python.io.yaml import read_yaml_model, write_yaml_model + +__all__ = [ + "export_for_git", + "export_model_to_sif", + "export_to_excel", + "read_yaml_model", + "write_yaml_model", +] diff --git a/src/raven_python/io/sif.py b/src/raven_python/io/sif.py new file mode 100644 index 0000000..9e73efa --- /dev/null +++ b/src/raven_python/io/sif.py @@ -0,0 +1,96 @@ +"""Export a model to Cytoscape SIF (Simple Interaction Format). + +Three graph types are supported: + +* ``"rc"`` reaction–compound: each reaction linked to its metabolites; +* ``"rr"`` reaction–reaction: reactions linked when they share a metabolite; +* ``"cc"`` compound–compound: each substrate linked to the products of the + reactions it feeds (computed on an irreversible copy, as RAVEN does, to avoid + spurious double links from reversible reactions). + +A SIF line is ``source graph_type target1 target2 ...``. +""" +from __future__ import annotations + +import warnings +from collections import Counter +from collections.abc import Mapping +from pathlib import Path + +import cobra + +from raven_python.manipulation.irreversible import convert_to_irreversible + +_GRAPH_TYPES = ("rc", "rr", "cc") + + +def _edges(model, graph_type): + """Yield (source_object, [target_objects]) per the graph type.""" + if graph_type == "rc": + for rxn in model.reactions: + yield rxn, list(rxn.metabolites) + elif graph_type == "rr": + for rxn in model.reactions: + neighbours = {r for met in rxn.metabolites for r in met.reactions} + neighbours.discard(rxn) + yield rxn, list(neighbours) + else: # cc — on an irreversible copy + irrev = model.copy() + convert_to_irreversible(irrev) + for met in irrev.metabolites: + products: set = set() + for rxn in met.reactions: + if rxn.get_coefficient(met) < 0: # met is a substrate here + products.update(m for m, c in rxn.metabolites.items() if c > 0) + yield met, list(products) + + +def export_model_to_sif( + model: cobra.Model, + path: str | Path, + graph_type: str = "rc", + *, + reaction_labels: Mapping[str, str] | None = None, + metabolite_labels: Mapping[str, str] | None = None, +) -> None: + """Write ``model`` to a Cytoscape SIF file. + + Parameters + ---------- + graph_type + ``"rc"`` (reaction–compound, default), ``"rr"`` (reaction–reaction), or + ``"cc"`` (compound–compound). + reaction_labels, metabolite_labels + Optional ``{id: label}`` maps overriding the node labels (default: IDs). + """ + if graph_type not in _GRAPH_TYPES: + raise ValueError(f"graph_type must be one of {_GRAPH_TYPES}, got {graph_type!r}") + + rlabels = reaction_labels or {} + mlabels = metabolite_labels or {} + + # Warn when the label maps collapse multiple distinct ids onto the same + # label: target-side dedup runs on labels, so the collision silently merges + # two nodes into one edge. Only check the ids actually mapped (cobra default + # labels are ids, which can't collide). + for kind, lmap in (("reaction", rlabels), ("metabolite", mlabels)): + duplicates = [lab for lab, n in Counter(lmap.values()).items() if n > 1] + if duplicates: + warnings.warn( + f"{kind}_labels maps multiple ids to the same label(s) " + f"({duplicates[:5]}{'…' if len(duplicates) > 5 else ''}); " + "SIF nodes are keyed by label, so those nodes will collapse.", + stacklevel=2, + ) + + def label(obj) -> str: + if isinstance(obj, cobra.Reaction): + return rlabels.get(obj.id, obj.id) + return mlabels.get(obj.id, obj.id) + + with open(path, "w", encoding="utf-8") as handle: + for source, targets in _edges(model, graph_type): + src = label(source) + names = sorted({label(t) for t in targets} - {src}) + if names: + handle.write(f"{src}\t{graph_type}\t" + "\t".join(names) + "\n") diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py new file mode 100644 index 0000000..151954b --- /dev/null +++ b/src/raven_python/io/yaml.py @@ -0,0 +1,191 @@ +"""Read and write RAVEN/cobrapy YAML models. + +Aligned to RAVEN ``writeYAMLmodel.m`` / ``readYAMLmodel.m`` as of the +``feat/geckopy-compat-yaml`` work (commit fa281a1), whose writer emits **cobra's +native ``!!omap`` YAML**. Because the format *is* cobra's, the standard model +content — id, name, compartments, and per-entry id/name/compartment/formula/ +charge/bounds/gene_reaction_rule/objective_coefficient/subsystem/metabolites and +the whole ``annotation`` block (which carries ``smiles`` for metabolites, +``ec-code`` for reactions, and all MIRIAM cross-references) — is read and written +by ``cobra.io`` directly. + +This module only handles what cobra drops or mishandles: + +* **RAVEN-only top-level per-entry keys** that cobra ignores: ``inchis``, + ``deltaG``, ``metFrom`` and the free-text ``notes`` (metNotes) on metabolites; + ``confidence_score``, ``references``, ``rxnFrom``, ``deltaG`` and ``notes`` + (rxnNotes) on reactions; ``protein`` on genes. These are stashed in the cobra + object's ``.notes`` dict on read and lifted back to top-level keys on write. +* **Model-level extras** cobra ignores: ``version``, the ``metaData`` provenance + block, and the GECKO sections (``gecko_light``/``ec-rxns``/``ec-enzymes``), + preserved on ``model.notes`` for round-tripping. + +The reader also accepts the older RAVEN files (id/name nested in ``metaData``). +""" +from __future__ import annotations + +import gzip +from collections import OrderedDict +from pathlib import Path + +import cobra +from cobra.io.dict import model_from_dict, model_to_dict +from cobra.io.yaml import yaml as _cobra_yaml # ruamel round-trip YAML (handles !!omap) + + +def _open_text(path: str | Path, mode: str): + """Open ``path`` as a text handle, transparently gzipping when it ends ``.gz``.""" + if str(path).endswith(".gz"): + return gzip.open(path, f"{mode}t", encoding="utf-8") + return open(path, mode, encoding="utf-8") + +# RAVEN-only top-level per-entry keys -> the key used inside the cobra object's +# .notes dict. ('notes' is RAVEN's free-text metNotes/rxnNotes; stored under +# 'note' to avoid colliding with the notes container itself.) +_MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note")) +_RXN_FIELDS = ( + ("confidence_score", "confidence_score"), + ("references", "references"), + ("rxnFrom", "rxnFrom"), + ("deltaG", "deltaG"), + ("notes", "note"), +) +_GENE_FIELDS = (("protein", "protein"),) + +_COBRA_TOP_KEYS = frozenset({"metabolites", "reactions", "genes", "compartments", "id", "name"}) + + +def _to_plain(obj): + if isinstance(obj, dict): + return {str(k): _to_plain(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [_to_plain(v) for v in obj] + if isinstance(obj, bool) or obj is None: + return obj + if isinstance(obj, int): + return int(obj) + if isinstance(obj, float): + return float(obj) + return obj if isinstance(obj, str) else str(obj) + + +def _capture_entry_fields(entries, fields): + """Pop RAVEN-only top-level keys off each entry into a parallel notes dict. + + Returns a list of ``{notes_key: value}`` dicts aligned with ``entries`` (so + cobra never sees these keys), to be attached to the built objects afterwards. + """ + captured = [] + for entry in entries: + notes = {} + for yaml_key, notes_key in fields: + if yaml_key in entry: + notes[notes_key] = entry.pop(yaml_key) + captured.append(notes) + return captured + + +def read_yaml_model(path: str | Path) -> cobra.Model: + """Read a RAVEN/cobrapy YAML model into a ``cobra.Model``.""" + with _open_text(path, "r") as handle: + raw = _to_plain(_cobra_yaml.load(handle)) + + if not isinstance(raw, dict): + raise ValueError(f"{path}: top-level YAML is a {type(raw).__name__}, not a mapping.") + + metadata = raw.pop("metaData", None) or {} + version = raw.pop("version", None) + foreign = {k: raw.pop(k) for k in list(raw) if k not in _COBRA_TOP_KEYS} + + met_notes = _capture_entry_fields(raw.get("metabolites", []), _MET_FIELDS) + rxn_notes = _capture_entry_fields(raw.get("reactions", []), _RXN_FIELDS) + gene_notes = _capture_entry_fields(raw.get("genes", []), _GENE_FIELDS) + + model = model_from_dict(raw) + + for met, notes in zip(model.metabolites, met_notes, strict=False): + met.notes = notes + for rxn, notes in zip(model.reactions, rxn_notes, strict=False): + rxn.notes = notes + for gene, notes in zip(model.genes, gene_notes, strict=False): + gene.notes = notes + + # Legacy files keep id/name inside metaData; restore them if cobra found none. + if metadata.get("id") and not model.id: + model.id = metadata["id"] + if metadata.get("name") and not model.name: + model.name = metadata["name"] + if metadata: + model.notes["metaData"] = metadata + if version is not None: + model.notes["version"] = version + if foreign: + model.notes["_yaml_sections"] = foreign + + return model + + +def _emit_entry_fields(entries, fields): + """Lift RAVEN-only keys out of each entry's ``notes`` dict to top level.""" + for entry in entries: + notes = entry.pop("notes", None) + if not isinstance(notes, dict): + continue + notes = dict(notes) + for yaml_key, notes_key in fields: + if notes_key in notes: + entry[yaml_key] = notes.pop(notes_key) + # Preserve any remaining (non-RAVEN) notes. The RAVEN free-text note is lifted + # to the YAML key "notes"; if leftovers also exist, merge them with it under + # that key (rather than silently dropping the leftovers). + if notes: + if "notes" in entry: + notes["note"] = entry["notes"] + entry["notes"] = notes + + +def write_yaml_model( + model: cobra.Model, path: str | Path, *, sort_ids: bool = False +) -> None: + """Write a ``cobra.Model`` to RAVEN/cobrapy (``!!omap``) YAML. + + With ``sort_ids=True`` metabolites/reactions/genes/compartments are written + in alphabetical order (diff-friendly), without modifying ``model``. + """ + model_notes = dict(model.notes or {}) + stored_meta = model_notes.pop("metaData", None) or {} + version = model_notes.pop("version", None) + foreign = model_notes.pop("_yaml_sections", None) or {} + + doc = OrderedDict(_to_plain(model_to_dict(model))) + + if sort_ids: + for section in ("metabolites", "reactions", "genes"): + if section in doc: + doc[section] = sorted(doc[section], key=lambda e: e.get("id", "")) + if isinstance(doc.get("compartments"), dict): + doc["compartments"] = dict(sorted(doc["compartments"].items())) + + _emit_entry_fields(doc.get("metabolites", []), _MET_FIELDS) + _emit_entry_fields(doc.get("reactions", []), _RXN_FIELDS) + _emit_entry_fields(doc.get("genes", []), _GENE_FIELDS) + + # cobra dict order is metabolites, reactions, genes, id, name, compartments; + # append version / gecko_light / metaData / ec-* like RAVEN's writer. + if version is not None: + doc["version"] = version + metadata = dict(stored_meta) + if model.id: + metadata.setdefault("id", model.id) + if model.name: + metadata.setdefault("name", model.name) + for key in ("gecko_light",): + if key in foreign: + doc[key] = foreign.pop(key) + if metadata: + doc["metaData"] = metadata + for key, value in foreign.items(): + doc[key] = value + + with _open_text(path, "w") as handle: + _cobra_yaml.dump(doc, handle) diff --git a/tests/test_io_sif.py b/tests/test_io_sif.py new file mode 100644 index 0000000..d50ad98 --- /dev/null +++ b/tests/test_io_sif.py @@ -0,0 +1,82 @@ +"""Tests for raven_python.io.sif (exportModelToSIF port).""" +import cobra +import pytest + +from raven_python.io import export_model_to_sif +from raven_python.manipulation import add_reactions_from_equations + + +@pytest.fixture +def model(): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite(x, compartment="c") for x in ("a", "b", "c")]) + add_reactions_from_equations( + m, + [ + {"id": "R1", "equation": "a --> b"}, + {"id": "R2", "equation": "b --> c"}, + ], + ) + return m + + +def _lines(path): + return [ln.split("\t") for ln in path.read_text().splitlines()] + + +def test_reaction_compound(model, tmp_path): + out = tmp_path / "g.sif" + export_model_to_sif(model, out, "rc") + rows = {r[0]: (r[1], set(r[2:])) for r in _lines(out)} + assert rows["R1"] == ("rc", {"a", "b"}) + assert rows["R2"] == ("rc", {"b", "c"}) + + +def test_reaction_reaction(model, tmp_path): + out = tmp_path / "g.sif" + export_model_to_sif(model, out, "rr") + rows = {r[0]: set(r[2:]) for r in _lines(out)} + # R1 and R2 share metabolite b + assert rows["R1"] == {"R2"} + assert rows["R2"] == {"R1"} + + +def test_compound_compound(model, tmp_path): + out = tmp_path / "g.sif" + export_model_to_sif(model, out, "cc") + rows = {r[0]: set(r[2:]) for r in _lines(out)} + # a is a substrate of R1 (a->b): a links to product b + assert "b" in rows.get("a", set()) + # b is substrate of R2 (b->c): b links to c + assert "c" in rows.get("b", set()) + + +def test_custom_labels(model, tmp_path): + out = tmp_path / "g.sif" + export_model_to_sif(model, out, "rc", reaction_labels={"R1": "Reaction1"}) + sources = {r[0] for r in _lines(out)} + assert "Reaction1" in sources + assert "R1" not in sources + + +def test_bad_graph_type(model, tmp_path): + with pytest.raises(ValueError, match="graph_type"): + export_model_to_sif(model, tmp_path / "g.sif", "xx") + + +def test_cc_does_not_mutate_input(model, tmp_path): + n_before = len(model.reactions) + export_model_to_sif(model, tmp_path / "g.sif", "cc") + assert len(model.reactions) == n_before # convert_to_irreversible ran on a copy + + +# --- regression: label-map collision (known_issues.md B4) ------------------ + +def test_collapsing_label_map_warns(model, tmp_path): + """A label map that sends two distinct ids to the same label silently merges + nodes during the target-side dedup. Now warns so the user sees it.""" + with pytest.warns(UserWarning, match="multiple ids to the same label"): + export_model_to_sif( + model, tmp_path / "g.sif", "rc", + reaction_labels={"R1": "shared", "R2": "shared"}, + ) diff --git a/tests/test_io_yaml.py b/tests/test_io_yaml.py new file mode 100644 index 0000000..510af5f --- /dev/null +++ b/tests/test_io_yaml.py @@ -0,0 +1,202 @@ +"""Tests for raven_python.io.yaml against the RAVEN fa281a1 (cobra-native !!omap) schema.""" +from pathlib import Path + +import cobra +import pytest +from cobra.io.yaml import yaml as cobra_yaml + +from raven_python.io import read_yaml_model, write_yaml_model + +# A model laid out exactly as RAVEN writeYAMLmodel (fa281a1) emits: cobra-native +# structure, RAVEN-only fields as top-level per-entry keys, smiles/ec-code inside +# the annotation block, metaData provenance-only, id/name/version top-level. +RAVEN_DOC = { + "metabolites": [ + { + "id": "s_0001", + "name": "ATP", + "compartment": "c", + "formula": "C10H16N5O13P3", + "charge": -4, + "inchis": "InChI=1S/CH4", + "deltaG": 12.5, + "notes": "a metabolite note", + "metFrom": "KEGG", + "annotation": {"kegg.compound": ["C00002"], "smiles": ["C1=NC2"]}, + }, + {"id": "s_0002", "name": "ADP", "compartment": "c"}, + ], + "reactions": [ + { + "id": "R1", + "name": "rxn one", + "metabolites": {"s_0001": -1, "s_0002": 1}, + "lower_bound": -1000.0, + "upper_bound": 1000.0, + "gene_reaction_rule": "G1", + "subsystem": "glycolysis", + "confidence_score": 2, + "references": "PMID:123", + "rxnFrom": "manual", + "notes": "a reaction note", + "deltaG": -5.0, + "annotation": {"ec-code": ["1.1.1.1"]}, + } + ], + "genes": [ + {"id": "G1", "name": "gene one", "protein": "P12345", "annotation": {"uniprot": ["P12345"]}} + ], + "id": "testModel", + "name": "Test Model", + "compartments": {"c": "cytoplasm"}, + "version": "1.0", + "metaData": {"date": "2026-05-23", "taxonomy": "taxonomy/559292", "defaultLB": "-1000"}, + "ec-rxns": [{"id": "R1", "kcat": 100.0}], +} + + +@pytest.fixture +def yaml_file(tmp_path) -> Path: + p = tmp_path / "model.yml" + with open(p, "w", encoding="utf-8") as fh: + cobra_yaml.dump(RAVEN_DOC, fh) + return p + + +def test_standard_content(yaml_file): + model = read_yaml_model(yaml_file) + assert model.id == "testModel" + assert model.name == "Test Model" + assert {m.id for m in model.metabolites} == {"s_0001", "s_0002"} + r = model.reactions.get_by_id("R1") + assert r.bounds == (-1000.0, 1000.0) + assert r.subsystem == "glycolysis" + assert r.gene_reaction_rule == "G1" + + +def test_annotation_owned_by_cobra(yaml_file): + # smiles / ec-code / miriam live in the annotation block (cobra reads them) + model = read_yaml_model(yaml_file) + assert model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"] + assert model.metabolites.get_by_id("s_0001").annotation["kegg.compound"] == ["C00002"] + assert model.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"] + assert model.genes.get_by_id("G1").annotation["uniprot"] == ["P12345"] + + +def test_raven_only_fields_captured(yaml_file): + model = read_yaml_model(yaml_file) + a = model.metabolites.get_by_id("s_0001") + assert a.notes["inchis"] == "InChI=1S/CH4" + assert a.notes["deltaG"] == 12.5 + assert a.notes["note"] == "a metabolite note" # RAVEN metNotes string, no crash + assert a.notes["metFrom"] == "KEGG" + assert "smiles" not in a.notes # smiles stays in annotation + r = model.reactions.get_by_id("R1") + assert r.notes["confidence_score"] == 2 + assert r.notes["references"] == "PMID:123" + assert r.notes["rxnFrom"] == "manual" + assert r.notes["note"] == "a reaction note" + assert r.notes["deltaG"] == -5.0 + assert model.genes.get_by_id("G1").notes["protein"] == "P12345" + + +def test_model_level_extras(yaml_file): + model = read_yaml_model(yaml_file) + assert model.notes["metaData"]["taxonomy"] == "taxonomy/559292" + assert model.notes["version"] == "1.0" + assert model.notes["_yaml_sections"]["ec-rxns"][0]["kcat"] == 100.0 + + +def test_round_trip(yaml_file, tmp_path): + model = read_yaml_model(yaml_file) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + reloaded = read_yaml_model(out) + + assert reloaded.id == "testModel" + assert reloaded.notes["version"] == "1.0" + assert reloaded.notes["metaData"]["taxonomy"] == "taxonomy/559292" + a = reloaded.metabolites.get_by_id("s_0001") + assert a.notes["deltaG"] == 12.5 + assert a.notes["note"] == "a metabolite note" + assert a.annotation["smiles"] == ["C1=NC2"] + r = reloaded.reactions.get_by_id("R1") + assert r.notes["confidence_score"] == 2 + assert reloaded.genes.get_by_id("G1").notes["protein"] == "P12345" + assert reloaded.notes["_yaml_sections"]["ec-rxns"][0]["id"] == "R1" + + +def test_extra_notes_not_dropped_when_free_text_note_present(yaml_file, tmp_path): + """An entry with both a RAVEN free-text note and an extra note keeps both on write.""" + model = read_yaml_model(yaml_file) + a = model.metabolites.get_by_id("s_0001") + a.notes["note"] = "free text" + a.notes["custom"] = "extra value" # a non-RAVEN note that must not be silently lost + out = tmp_path / "out.yml" + write_yaml_model(model, out) + text = out.read_text() + assert "extra value" in text # the leftover note survives serialization + + +def test_gzipped_round_trip(yaml_file, tmp_path): + # A .yml.gz path is transparently gzipped on write and read. + model = read_yaml_model(yaml_file) + out = tmp_path / "out.yml.gz" + write_yaml_model(model, out) + assert out.read_bytes()[:2] == b"\x1f\x8b" # gzip magic + reloaded = read_yaml_model(out) + assert reloaded.id == "testModel" + assert {m.id for m in reloaded.metabolites} == {"s_0001", "s_0002"} + + +def test_output_is_cobra_readable(yaml_file, tmp_path): + # The written file must load with stock cobra (it's cobra's native format). + model = read_yaml_model(yaml_file) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + cobra_model = cobra.io.load_yaml_model(str(out)) + assert cobra_model.id == "testModel" + assert {m.id for m in cobra_model.metabolites} == {"s_0001", "s_0002"} + # RAVEN-only fields land in cobra notes; smiles in annotation + assert cobra_model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"] + + +def test_write_emits_raven_top_level_keys(yaml_file, tmp_path): + model = read_yaml_model(yaml_file) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + text = out.read_text() + # RAVEN-only fields are lifted back to top-level entry keys, not buried in notes + assert "inchis:" in text + assert "deltaG:" in text + assert "confidence_score:" in text + assert "metaData:" in text + + +def test_legacy_id_in_metadata(tmp_path): + # Older RAVEN files nest id/name under metaData and have no top-level id. + legacy = { + "metabolites": [{"id": "a_c", "name": "A", "compartment": "c"}], + "reactions": [], + "genes": [], + "compartments": {"c": "cyt"}, + "metaData": {"id": "legacyModel", "name": "Legacy"}, + } + p = tmp_path / "legacy.yml" + with open(p, "w", encoding="utf-8") as fh: + cobra_yaml.dump(legacy, fh) + model = read_yaml_model(p) + assert model.id == "legacyModel" + assert model.name == "Legacy" + + +# Optional smoke test against a real model file if present. +_YEAST = Path("/home/eduardk/github/GECKO/tutorials/full_ecModel/models/yeast-GEM.yml") + + +@pytest.mark.skipif(not _YEAST.exists(), reason="real yeast-GEM.yml not available") +def test_real_yeast_gem_loads(): + model = read_yaml_model(_YEAST) + assert len(model.reactions) > 1000 + # legacy file: identity comes from metaData + assert model.id From 7a9b69accd9965c7f365e3d6a680fa2cc9fe293e Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Fri, 29 May 2026 22:51:36 +0200 Subject: [PATCH 7/7] Add Excel export and the Standard-GEM git-layout export --- src/raven_python/io/excel.py | 136 +++++++++++++++++++++++++++++++++++ src/raven_python/io/git.py | 106 +++++++++++++++++++++++++++ tests/test_io_excel.py | 111 ++++++++++++++++++++++++++++ tests/test_io_git.py | 69 ++++++++++++++++++ 4 files changed, 422 insertions(+) create mode 100644 src/raven_python/io/excel.py create mode 100644 src/raven_python/io/git.py create mode 100644 tests/test_io_excel.py create mode 100644 tests/test_io_git.py diff --git a/src/raven_python/io/excel.py b/src/raven_python/io/excel.py new file mode 100644 index 0000000..cf6196e --- /dev/null +++ b/src/raven_python/io/excel.py @@ -0,0 +1,136 @@ +"""Export a model to the RAVEN Microsoft Excel format. + +Writes the five-sheet RAVEN xlsx layout — RXNS, METS, COMPS, GENES, MODEL — pulling +RAVEN-specific values back out of cobra's ``annotation`` / ``notes`` (where the +raven_python YAML reader stashes them). Excel *import* is intentionally not provided. + +Requires the optional ``openpyxl`` dependency (``pip install raven_python[excel]``). +""" +from __future__ import annotations + +from pathlib import Path + +import cobra + + +def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str: + """RAVEN MIRIAM column: ``namespace/id;namespace/id2;...`` (sorted).""" + parts = [] + for namespace in sorted(annotation): + if namespace in exclude: + continue + values = annotation[namespace] + if isinstance(values, str): + values = [values] + parts.extend(f"{namespace}/{value}" for value in values) + return ";".join(parts) + + +def _equation(rxn: cobra.Reaction) -> str: + """Human-readable equation in RAVEN ``name[comp]`` form.""" + + def side(items): + return " + ".join( + f"{abs(coef):g} {met.name}[{met.compartment}]" for met, coef in items + ) + + reactants = [(m, c) for m, c in rxn.metabolites.items() if c < 0] + products = [(m, c) for m, c in rxn.metabolites.items() if c > 0] + arrow = " <=> " if rxn.reversibility else " => " + return f"{side(reactants)}{arrow}{side(products)}" + + +def _ec_codes(rxn: cobra.Reaction) -> str: + codes = rxn.annotation.get("ec-code", []) + if isinstance(codes, str): + codes = [codes] + return ";".join(codes) + + +def export_to_excel( + model: cobra.Model, path: str | Path, *, sort_ids: bool = False +) -> None: + """Write ``model`` to a RAVEN-format ``.xlsx`` file. + + Parameters + ---------- + sort_ids + If True, write reactions/metabolites/genes sorted alphabetically by ID + (the model itself is not modified). + """ + try: + from openpyxl import Workbook + except ImportError as exc: # pragma: no cover - exercised only without openpyxl + raise ImportError( + "export_to_excel requires openpyxl. Install it with " + "`pip install raven_python[excel]` (or `pip install openpyxl`)." + ) from exc + + reactions = sorted(model.reactions, key=lambda r: r.id) if sort_ids else list(model.reactions) + metabolites = ( + sorted(model.metabolites, key=lambda m: m.id) if sort_ids else list(model.metabolites) + ) + genes = sorted(model.genes, key=lambda g: g.id) if sort_ids else list(model.genes) + metadata = dict(model.notes.get("metaData", {})) if model.notes else {} + + wb = Workbook() + wb.remove(wb.active) # drop the default empty sheet + + # --- RXNS --- + ws = wb.create_sheet("RXNS") + ws.append( + ["#", "ID", "NAME", "EQUATION", "EC-NUMBER", "GENE ASSOCIATION", "LOWER BOUND", + "UPPER BOUND", "OBJECTIVE", "COMPARTMENT", "MIRIAM", "SUBSYSTEM", + "REPLACEMENT ID", "NOTE", "REFERENCE", "CONFIDENCE SCORE"] + ) + for r in reactions: + subsystem = r.subsystem + if isinstance(subsystem, (list, tuple)): + subsystem = ";".join(subsystem) + ws.append([ + None, r.id, r.name, _equation(r), _ec_codes(r), r.gene_reaction_rule, + r.lower_bound, r.upper_bound, + r.objective_coefficient or None, None, + _miriam_string(r.annotation, exclude=("ec-code",)), subsystem, None, + r.notes.get("note"), r.notes.get("references"), r.notes.get("confidence_score"), + ]) + + # --- METS --- + ws = wb.create_sheet("METS") + ws.append(["#", "ID", "NAME", "UNCONSTRAINED", "MIRIAM", "COMPOSITION", "InChI", + "COMPARTMENT", "REPLACEMENT ID", "CHARGE"]) + for m in metabolites: + inchi = m.notes.get("inchis") + ws.append([ + None, f"{m.name}[{m.compartment}]", m.name, None, + _miriam_string(m.annotation, exclude=("smiles",)), + None if inchi else m.formula, inchi, m.compartment, m.id, m.charge, + ]) + + # --- COMPS --- + ws = wb.create_sheet("COMPS") + ws.append(["#", "ABBREVIATION", "NAME", "INSIDE", "MIRIAM"]) + comps = sorted(model.compartments) if sort_ids else list(model.compartments) + for cid in comps: + ws.append([None, cid, model.compartments.get(cid, ""), None, None]) + + # --- GENES --- + if genes: + ws = wb.create_sheet("GENES") + ws.append(["#", "NAME", "MIRIAM", "SHORT NAME", "COMPARTMENT"]) + for g in genes: + ws.append([None, g.id, _miriam_string(g.annotation), g.name, None]) + + # --- MODEL --- + ws = wb.create_sheet("MODEL") + ws.append(["#", "ID", "NAME", "TAXONOMY", "DEFAULT LOWER", "DEFAULT UPPER", + "CONTACT GIVEN NAME", "CONTACT FAMILY NAME", "CONTACT EMAIL", + "ORGANIZATION", "NOTES"]) + ws.append([ + None, model.id or "blankID", model.name or "blankName", + metadata.get("taxonomy"), metadata.get("defaultLB"), metadata.get("defaultUB"), + metadata.get("givenName"), metadata.get("familyName"), metadata.get("email"), + metadata.get("organization"), metadata.get("note"), + ]) + + wb.save(str(path)) diff --git a/src/raven_python/io/git.py b/src/raven_python/io/git.py new file mode 100644 index 0000000..80bf8e8 --- /dev/null +++ b/src/raven_python/io/git.py @@ -0,0 +1,106 @@ +"""Export a model into a Standard-GEM versioned-repository layout. + +Writes the model in several formats into the Standard-GEM folder structure (a +``model/`` directory with one subfolder per format), ready to commit to a +Git-maintained model repository (Metabolic Atlas / Human-GEM / yeast-GEM style), +plus a ``dependencies.txt`` recording tool versions. + +Thin orchestration over the writers raven_python already exposes: ``write_yaml_model``, +cobra's ``write_sbml_model`` and ``save_matlab_model``, ``export_to_excel``, plus a +single-file reaction table (txt). +""" +from __future__ import annotations + +import importlib.metadata as _md +import platform +from collections.abc import Iterable +from pathlib import Path + +import cobra + +from raven_python.io.excel import _equation, export_to_excel +from raven_python.io.yaml import write_yaml_model +from raven_python.utils.sort import sort_identifiers + +_ALL_FORMATS = ("yml", "xml", "mat", "xlsx", "txt") + + +def _version(package: str) -> str: + try: + return _md.version(package) + except _md.PackageNotFoundError: + return "unknown" + + +def _write_txt(model: cobra.Model, path: Path) -> None: + """Single-file, human-readable reaction table (RAVEN exportForGit txt).""" + with open(path, "w", encoding="utf-8") as fh: + fh.write("Rxn name\tFormula\tGene-reaction association\tLB\tUB\tObjective\n") + for r in model.reactions: + fh.write( + f"{r.id}\t{_equation(r)}\t{r.gene_reaction_rule}\t" + f"{r.lower_bound:g}\t{r.upper_bound:g}\t{r.objective_coefficient:g}\n" + ) + + +def export_for_git( + model: cobra.Model, + path: str | Path = ".", + *, + prefix: str = "model", + formats: Iterable[str] = ("yml", "xml", "mat", "xlsx"), + sub_dirs: bool = True, +) -> Path: + """Write ``model`` into a Standard-GEM repository layout. + + Parameters + ---------- + path + Directory to populate. + prefix + Base filename for every format (default ``"model"``). + formats + Which formats to write; any of ``"yml"``, ``"xml"``, ``"mat"``, + ``"xlsx"``, ``"txt"`` (default ``yml``/``xml``/``mat``/``xlsx``). + sub_dirs + If True (default), write ``model//.`` (standard-GEM + layout); otherwise all files go directly in ``path``. + + Returns + ------- + pathlib.Path + The root directory written to. + """ + formats = list(formats) + unknown = set(formats) - set(_ALL_FORMATS) + if unknown: + raise ValueError(f"Unknown format(s): {sorted(unknown)}; allowed: {_ALL_FORMATS}") + + # Sort a copy so the caller's model is untouched. + model = sort_identifiers(model.copy()) + + root = Path(path) / "model" if sub_dirs else Path(path) + root.mkdir(parents=True, exist_ok=True) + + def target(fmt: str) -> Path: + folder = root / fmt if sub_dirs else root + folder.mkdir(parents=True, exist_ok=True) + return folder / f"{prefix}.{fmt}" + + if "yml" in formats: + write_yaml_model(model, target("yml")) + if "xml" in formats: + cobra.io.write_sbml_model(model, str(target("xml"))) + if "mat" in formats: + cobra.io.save_matlab_model(model, str(target("mat"))) + if "xlsx" in formats: + export_to_excel(model, target("xlsx")) + if "txt" in formats: + _write_txt(model, target("txt")) + + with open(root / "dependencies.txt", "w", encoding="utf-8") as fh: + fh.write(f"python\t{platform.python_version()}\n") + fh.write(f"cobra\t{_version('cobra')}\n") + fh.write(f"raven_python\t{_version('raven_python')}\n") + + return root diff --git a/tests/test_io_excel.py b/tests/test_io_excel.py new file mode 100644 index 0000000..12434ef --- /dev/null +++ b/tests/test_io_excel.py @@ -0,0 +1,111 @@ +"""Tests for raven_python.io.excel (exportToExcelFormat port, export only).""" +import cobra +import pytest + +openpyxl = pytest.importorskip("openpyxl") + +from raven_python.io import export_to_excel +from raven_python.manipulation import add_reactions_from_equations + + +@pytest.fixture +def model(): + m = cobra.Model("yeastGEM") + m.name = "Yeast" + m.compartments = {"c": "cytoplasm"} + m.notes["metaData"] = {"taxonomy": "taxonomy/559292", "defaultLB": "-1000"} + m.add_metabolites( + [ + cobra.Metabolite("atp_c", name="ATP", formula="C10H16N5O13P3", charge=-4, compartment="c"), + cobra.Metabolite("adp_c", name="ADP", compartment="c"), + ] + ) + m.metabolites.atp_c.annotation = {"kegg.compound": ["C00002"], "smiles": ["C1=NC"]} + m.metabolites.atp_c.notes = {"inchis": "InChI=1S/X"} + add_reactions_from_equations( + m, + [{"id": "R1", "equation": "atp_c <=> adp_c", "name": "rxn one", + "gene_reaction_rule": "G1", "subsystem": "glycolysis"}], + ) + r = m.reactions.R1 + r.annotation = {"ec-code": ["1.1.1.1"], "kegg.reaction": ["R00001"]} + r.notes = {"confidence_score": 2, "note": "a note", "references": "PMID:1"} + r.objective_coefficient = 1 + return m + + +def _wb(path): + return openpyxl.load_workbook(path) + + +def test_sheets_present(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + wb = _wb(out) + assert set(wb.sheetnames) == {"RXNS", "METS", "COMPS", "GENES", "MODEL"} + + +def test_rxns_sheet(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + ws = _wb(out)["RXNS"] + header = [c.value for c in ws[1]] + row = {header[i]: c.value for i, c in enumerate(ws[2])} + assert row["ID"] == "R1" + assert row["NAME"] == "rxn one" + assert "ATP[c]" in row["EQUATION"] and "<=>" in row["EQUATION"] + assert row["EC-NUMBER"] == "1.1.1.1" + assert row["GENE ASSOCIATION"] == "G1" + assert row["SUBSYSTEM"] == "glycolysis" + assert row["OBJECTIVE"] == 1 + assert row["CONFIDENCE SCORE"] == 2 + assert row["NOTE"] == "a note" + assert row["MIRIAM"] == "kegg.reaction/R00001" # ec-code excluded (own column) + + +def test_mets_sheet(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + ws = _wb(out)["METS"] + header = [c.value for c in ws[1]] + rows = { + r[header.index("REPLACEMENT ID")].value: {header[i]: c.value for i, c in enumerate(r)} + for r in ws.iter_rows(min_row=2) + } + atp = rows["atp_c"] + assert atp["ID"] == "ATP[c]" + assert atp["NAME"] == "ATP" + assert atp["InChI"] == "InChI=1S/X" + assert atp["COMPOSITION"] is None # suppressed when InChI present + assert atp["CHARGE"] == -4 + assert atp["MIRIAM"] == "kegg.compound/C00002" # smiles excluded + + +def test_model_sheet(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + ws = _wb(out)["MODEL"] + header = [c.value for c in ws[1]] + row = {header[i]: c.value for i, c in enumerate(ws[2])} + assert row["ID"] == "yeastGEM" + assert row["NAME"] == "Yeast" + assert row["TAXONOMY"] == "taxonomy/559292" + assert row["DEFAULT LOWER"] == "-1000" + + +def test_genes_sheet(model, tmp_path): + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + ws = _wb(out)["GENES"] + header = [c.value for c in ws[1]] + row = {header[i]: c.value for i, c in enumerate(ws[2])} + assert row["NAME"] == "G1" + + +def test_no_genes_skips_sheet(tmp_path): + m = cobra.Model("t") + m.add_metabolites([cobra.Metabolite("a_c", compartment="c")]) + add_reactions_from_equations(m, [{"id": "R1", "equation": "a_c -->"}]) + out = tmp_path / "m.xlsx" + export_to_excel(m, out) + assert "GENES" not in _wb(out).sheetnames diff --git a/tests/test_io_git.py b/tests/test_io_git.py new file mode 100644 index 0000000..28881dc --- /dev/null +++ b/tests/test_io_git.py @@ -0,0 +1,69 @@ +"""Tests for raven_python.io.git (exportForGit port).""" +import cobra +import pytest + +from raven_python.io import export_for_git +from raven_python.manipulation import add_reactions_from_equations + + +@pytest.fixture +def model(): + m = cobra.Model("yeastGEM") + m.compartments = {"c": "cytoplasm"} + m.add_metabolites( + [cobra.Metabolite("atp_c", name="ATP", compartment="c"), + cobra.Metabolite("adp_c", name="ADP", compartment="c")] + ) + add_reactions_from_equations(m, [{"id": "R1", "equation": "atp_c <=> adp_c"}]) + return m + + +def test_standard_gem_layout(model, tmp_path): + root = export_for_git(model, tmp_path, prefix="yeast", formats=("yml", "xml", "mat", "xlsx", "txt")) + assert root == tmp_path / "model" + assert (root / "yml" / "yeast.yml").exists() + assert (root / "xml" / "yeast.xml").exists() + assert (root / "mat" / "yeast.mat").exists() + assert (root / "xlsx" / "yeast.xlsx").exists() + assert (root / "txt" / "yeast.txt").exists() + assert (root / "dependencies.txt").exists() + + +def test_dependencies_file(model, tmp_path): + root = export_for_git(model, tmp_path, formats=("yml",)) + deps = (root / "dependencies.txt").read_text() + assert "python\t" in deps + assert "cobra\t" in deps + assert "raven_python\t" in deps + + +def test_flat_layout(model, tmp_path): + root = export_for_git(model, tmp_path, formats=("yml",), sub_dirs=False) + assert root == tmp_path + assert (tmp_path / "model.yml").exists() + + +def test_subset_of_formats(model, tmp_path): + root = export_for_git(model, tmp_path, formats=("yml", "xml")) + assert (root / "yml" / "model.yml").exists() + assert not (root / "mat").exists() + assert not (root / "xlsx").exists() + + +def test_does_not_mutate_model(model, tmp_path): + order_before = [r.id for r in model.reactions] + export_for_git(model, tmp_path, formats=("yml",)) + assert [r.id for r in model.reactions] == order_before + + +def test_txt_table_content(model, tmp_path): + root = export_for_git(model, tmp_path, formats=("txt",)) + txt = (root / "txt" / "model.txt").read_text() + assert txt.splitlines()[0].startswith("Rxn name\t") + assert "R1" in txt + assert "ATP[c]" in txt + + +def test_bad_format(model, tmp_path): + with pytest.raises(ValueError, match="Unknown format"): + export_for_git(model, tmp_path, formats=("yml", "json"))