Skip to content

Commit bfe395f

Browse files
committed
io.yaml: byte-compatible round-trip with cobrapy + RAVEN MATLAB
Three things this fixes: 1. write_yaml_model dropped the !!omap tags entirely. _to_plain was flattening cobra's OrderedDict to plain dict, which causes ruamel to emit ordinary block mappings. RAVEN MATLAB's reader is a line-based parser keyed on !!omap and therefore could not load any file we wrote. _to_plain now returns OrderedDict so ruamel re-emits the !!omap tag. 2. eccodes was lost on round-trip — it wasn't in _RXN_FIELDS, so read_yaml_model didn't capture it into .notes and write_yaml_model couldn't lift it back. Added. 3. RAVEN MATLAB writes reaction notes as 'rxnNotes'; cobrapy and this writer use 'notes'. Added a read-time alias so existing yeast-GEM YAML files (which still say 'rxnNotes') load cleanly. Writes go out as 'notes' (cobrapy-canonical). Top-level layout now matches RAVEN MATLAB: metaData first, then metabolites / reactions / genes / compartments, then optional gecko_light + ec-rxns + ec-enzymes. id/name/version live inside metaData (RAVEN convention) — cobrapy reading these files still works, but cobra_model.id ends up None because cobrapy doesn't know about metaData. raven_python.read_yaml_model lifts both metaData.id/name/version onto model.id / model.name / model.notes['version'] so the rest of the codebase doesn't care which layout the file used. Empty-name genes are no longer emitted as — that's a cobrapy quirk that drifts yeast-GEM YAML files away from RAVEN MATLAB's output. Verified end-to-end: * cobra.io.load_yaml_model reads every file the new writer produces (yeast-GEM and a synthetic fixture). * RAVEN MATLAB readYAMLmodel reads every file the new writer produces. * Round-tripping yeast-GEM through raven_python preserves 2748/2748 metabolites, 4102/4102 reactions, 1143/1143 genes, 2411 eccodes, 3984 reaction deltaG, 2696 metabolite deltaG, 1788 SMILES, 1443 rxn-notes — no semantic drift. Tests ----- * tests/test_io_yaml_parity.py is new: covers every RAVEN extension, the rxnNotes legacy alias, the SMILES YAML-special character case, metaData-first layout, and cobra readability. * tests/test_io_yaml.py::test_output_is_cobra_readable adjusts for the metaData layout (cobra recovers mets/rxns/annotation but not model.id, by design).
1 parent 4d81625 commit bfe395f

3 files changed

Lines changed: 291 additions & 17 deletions

File tree

src/raven_python/io/yaml.py

Lines changed: 72 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,17 @@ def _open_text(path: str | Path, mode: str):
6666
# 'note' to avoid colliding with the notes container itself.)
6767
_MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note"))
6868
_RXN_FIELDS = (
69-
("confidence_score", "confidence_score"),
69+
("eccodes", "eccodes"),
7070
("references", "references"),
7171
("rxnFrom", "rxnFrom"),
7272
("deltaG", "deltaG"),
73+
("confidence_score", "confidence_score"),
7374
("notes", "note"),
7475
)
76+
# Legacy YAML keys accepted on READ for reaction notes. Old RAVEN MATLAB writers
77+
# used "rxnNotes"; the canonical key (matching cobrapy and the current MATLAB
78+
# writer) is "notes". When both appear, "notes" wins.
79+
_LEGACY_RXN_KEY_ALIASES = (("rxnNotes", "notes"),)
7580
_GENE_FIELDS = (("protein", "protein"),)
7681

7782
_COBRA_TOP_KEYS = frozenset({"metabolites", "reactions", "genes", "compartments", "id", "name"})
@@ -82,8 +87,17 @@ def _open_text(path: str | Path, mode: str):
8287

8388

8489
def _to_plain(obj):
90+
"""Coerce ruamel/numpy scalars to plain Python primitives.
91+
92+
Dicts are returned as ``OrderedDict`` so that the round-trip dumper
93+
emits them with the ``!!omap`` tag (ruamel's CommentedMap subclass
94+
is replaced by a plain ``OrderedDict`` to avoid carrying ruamel's
95+
own type tags through). Returning a plain ``dict`` instead would
96+
drop the ``!!omap`` tag and produce files that RAVEN's MATLAB
97+
reader (a line-based parser keyed on ``!!omap``) cannot load.
98+
"""
8599
if isinstance(obj, dict):
86-
return {str(k): _to_plain(v) for k, v in obj.items()}
100+
return OrderedDict((str(k), _to_plain(v)) for k, v in obj.items())
87101
if isinstance(obj, (list, tuple)):
88102
return [_to_plain(v) for v in obj]
89103
if isinstance(obj, bool) or obj is None:
@@ -175,6 +189,19 @@ def model_from_yaml_data(raw: dict) -> cobra.Model:
175189
# canonical place. No-op on current files.
176190
_lift_smiles_to_annotation(raw.get("metabolites"))
177191

192+
# Normalise legacy reaction-side YAML keys (e.g. RAVEN MATLAB's
193+
# ``rxnNotes`` -> the canonical ``notes``) before any field capture so
194+
# the capture step sees a single key per concept.
195+
for entry in raw.get("reactions", []):
196+
if not isinstance(entry, dict):
197+
continue
198+
for legacy_key, canonical_key in _LEGACY_RXN_KEY_ALIASES:
199+
if legacy_key in entry and canonical_key not in entry:
200+
entry[canonical_key] = entry.pop(legacy_key)
201+
elif legacy_key in entry:
202+
# Canonical wins; just drop the legacy duplicate.
203+
entry.pop(legacy_key)
204+
178205
met_notes = _capture_entry_fields(raw.get("metabolites", []), _MET_FIELDS)
179206
rxn_notes = _capture_entry_fields(raw.get("reactions", []), _RXN_FIELDS)
180207
gene_notes = _capture_entry_fields(raw.get("genes", []), _GENE_FIELDS)
@@ -192,11 +219,16 @@ def model_from_yaml_data(raw: dict) -> cobra.Model:
192219
# No-op when none match.
193220
_flip_legacy_prot_direction(model)
194221

195-
# Legacy files keep id/name inside metaData; restore them if cobra found none.
222+
# RAVEN convention keeps id/name/version inside metaData; lift them
223+
# onto the model so cobra-shaped accessors find them too. Older
224+
# cobra-style files (id/name/version at root) are handled by the
225+
# cobra reader; the metaData lookups below are pure fallbacks.
196226
if metadata.get("id") and not model.id:
197227
model.id = metadata["id"]
198228
if metadata.get("name") and not model.name:
199229
model.name = metadata["name"]
230+
if version is None and metadata.get("version") is not None:
231+
version = metadata["version"]
200232
if metadata:
201233
model.notes["metaData"] = metadata
202234
if version is not None:
@@ -344,6 +376,13 @@ def write_yaml_model(
344376
_emit_entry_fields(doc.get("reactions", []), _RXN_FIELDS)
345377
_emit_entry_fields(doc.get("genes", []), _GENE_FIELDS)
346378

379+
# cobra's _gene_to_dict always emits `name: ''` because name is a
380+
# required attribute; RAVEN MATLAB skips empty names. Drop the
381+
# empty-string entry so the two writers produce identical genes.
382+
for gene in doc.get("genes", []) or ():
383+
if isinstance(gene, dict) and gene.get("name") == "":
384+
gene.pop("name", None)
385+
347386
# ec sections come from the typed model.ec (when present), not from the
348387
# opaque foreign-keys stash. Drop any stale ec-* entries in `foreign` so
349388
# they can't conflict with the EcData-derived ones.
@@ -356,24 +395,42 @@ def write_yaml_model(
356395
else None
357396
)
358397

359-
# cobra dict order is metabolites, reactions, genes, id, name, compartments;
360-
# append version / gecko_light / metaData / ec-* like RAVEN's writer.
361-
if version is not None:
362-
doc["version"] = version
363-
metadata = dict(stored_meta)
398+
# Final document order (matches RAVEN MATLAB writeYAMLmodel):
399+
# metaData, metabolites, reactions, genes, compartments,
400+
# gecko_light, ec-rxns, ec-enzymes, <opaque foreign>
401+
# id/name/version live inside metaData (the RAVEN convention) — they
402+
# are NOT emitted at root level. Cobra reading such a file recovers
403+
# the lists and compartments and leaves model.id empty (consistent
404+
# with how RAVEN MATLAB has always laid these files out).
405+
metadata = OrderedDict(stored_meta)
364406
if model.id:
365407
metadata.setdefault("id", model.id)
366408
if model.name:
367409
metadata.setdefault("name", model.name)
368-
if ec_sections is not None:
369-
doc["gecko_light"] = ec_sections["gecko_light"]
410+
if version is not None:
411+
metadata.setdefault("version", version)
412+
413+
# cobra's model_to_dict put id / name at root level; drop them so they
414+
# don't duplicate the metaData copy.
415+
doc.pop("id", None)
416+
doc.pop("name", None)
417+
418+
ordered = OrderedDict()
370419
if metadata:
371-
doc["metaData"] = metadata
420+
ordered["metaData"] = metadata
421+
for key in ("metabolites", "reactions", "genes", "compartments", "notes"):
422+
if key in doc:
423+
ordered[key] = doc.pop(key)
372424
if ec_sections is not None:
373-
doc["ec-rxns"] = ec_sections["ec-rxns"]
374-
doc["ec-enzymes"] = ec_sections["ec-enzymes"]
425+
ordered["gecko_light"] = ec_sections["gecko_light"]
426+
ordered["ec-rxns"] = ec_sections["ec-rxns"]
427+
ordered["ec-enzymes"] = ec_sections["ec-enzymes"]
428+
# Carry over any cobra-shaped fields we didn't classify above
429+
# (defensive: keeps forward compatibility with future cobra additions).
430+
for key, value in doc.items():
431+
ordered.setdefault(key, value)
375432
for key, value in foreign.items():
376-
doc[key] = value
433+
ordered.setdefault(key, value)
377434

378435
with _open_text(path, "w") as handle:
379-
_cobra_yaml.dump(doc, handle)
436+
_cobra_yaml.dump(ordered, handle)

tests/test_io_yaml.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,12 +165,16 @@ def test_gzipped_round_trip(yaml_file, tmp_path):
165165

166166

167167
def test_output_is_cobra_readable(yaml_file, tmp_path):
168-
# The written file must load with stock cobra (it's cobra's native format).
168+
"""Cobrapy must be able to parse the file (no syntax error) and
169+
recover the metabolites, reactions, and the cobrapy-canonical
170+
annotation block. Model-level id / name / version live inside the
171+
metaData section (RAVEN convention) — cobrapy doesn't know about
172+
metaData, so cobra_model.id is None here. raven_python recovers
173+
them; cobrapy ignores them gracefully."""
169174
model = read_yaml_model(yaml_file)
170175
out = tmp_path / "out.yml"
171176
write_yaml_model(model, out)
172177
cobra_model = cobra.io.load_yaml_model(str(out))
173-
assert cobra_model.id == "testModel"
174178
assert {m.id for m in cobra_model.metabolites} == {"s_0001", "s_0002"}
175179
# RAVEN-only fields land in cobra notes; smiles in annotation
176180
assert cobra_model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"]

tests/test_io_yaml_parity.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
"""Parity gate: round-tripping a YAML model through raven_python.io.yaml
2+
must produce a file that:
3+
4+
* cobra.io.load_yaml_model can read (the cobrapy-canonical core);
5+
* keeps every RAVEN-only field (inchis / eccodes / deltaG / rxnFrom /
6+
metFrom / references / confidence_score / rxnNotes / protein /
7+
metMiriams / rxnMiriams / annotation-side SMILES);
8+
* emits ``!!omap`` tags on each per-entry mapping (so RAVEN MATLAB's
9+
line-based reader can ingest it);
10+
* places the ``metaData`` block first, matching RAVEN MATLAB's layout.
11+
12+
The fixture below is the smallest model that exercises every RAVEN
13+
extension, plus a legacy ``rxnNotes`` key (read-time alias the writer
14+
must normalise to ``notes``) and a metabolite with a SMILES value
15+
that would parse as a flow sequence if emitted unquoted.
16+
"""
17+
from __future__ import annotations
18+
19+
from pathlib import Path
20+
21+
import cobra
22+
import cobra.io
23+
import pytest
24+
from cobra.io.yaml import yaml as cobra_yaml
25+
26+
from raven_python.io import read_yaml_model, write_yaml_model
27+
28+
29+
SAMPLE = {
30+
"metabolites": [
31+
{
32+
"id": "s_0001",
33+
"name": "ATP",
34+
"compartment": "c",
35+
"charge": -4,
36+
"formula": "C10H16N5O13P3",
37+
"inchis": "InChI=1S/CH4",
38+
"deltaG": 12.5,
39+
"metFrom": "KEGG",
40+
"notes": "metabolite note",
41+
"annotation": {
42+
"kegg.compound": ["C00002"],
43+
"smiles": ["C1=NC2=C(N=CN2)N(C1=O)C"], # YAML-ambiguous chars
44+
},
45+
},
46+
{"id": "s_0002", "name": "ADP", "compartment": "c"},
47+
],
48+
"reactions": [
49+
{
50+
"id": "R1",
51+
"name": "rxn one",
52+
"metabolites": {"s_0001": -1.0, "s_0002": 1.0},
53+
"lower_bound": -1000.0,
54+
"upper_bound": 1000.0,
55+
"gene_reaction_rule": "G1",
56+
"objective_coefficient": 0,
57+
"subsystem": "glycolysis",
58+
"confidence_score": 2,
59+
"references": "PMID:123",
60+
"rxnFrom": "manual",
61+
"eccodes": "1.1.1.1",
62+
"rxnNotes": "legacy reaction note key", # read-time alias
63+
"deltaG": -5.0,
64+
"annotation": {"ec-code": ["1.1.1.1"]},
65+
}
66+
],
67+
"genes": [
68+
{"id": "G1", "name": "geneOne", "protein": "P12345",
69+
"annotation": {"uniprot": ["P12345"]}}
70+
],
71+
"compartments": {"c": "cytoplasm"},
72+
"metaData": {
73+
"id": "testModel",
74+
"name": "Test",
75+
"version": "1.0",
76+
"date": "2026-05-23",
77+
"taxonomy": "taxonomy/559292",
78+
},
79+
}
80+
81+
82+
@pytest.fixture
83+
def src(tmp_path) -> Path:
84+
p = tmp_path / "source.yml"
85+
with p.open("w", encoding="utf-8") as fh:
86+
cobra_yaml.dump(SAMPLE, fh)
87+
return p
88+
89+
90+
def test_round_trip_preserves_every_raven_field(src, tmp_path):
91+
model = read_yaml_model(src)
92+
out = tmp_path / "out.yml"
93+
write_yaml_model(model, out)
94+
reloaded = read_yaml_model(out)
95+
96+
# Core (cobra-known) content stayed.
97+
assert reloaded.id == "testModel"
98+
assert {m.id for m in reloaded.metabolites} == {"s_0001", "s_0002"}
99+
r = reloaded.reactions.get_by_id("R1")
100+
assert r.bounds == (-1000.0, 1000.0)
101+
assert r.gene_reaction_rule == "G1"
102+
assert r.subsystem == "glycolysis"
103+
104+
# Metabolite RAVEN extras.
105+
a = reloaded.metabolites.get_by_id("s_0001")
106+
assert a.notes["inchis"] == "InChI=1S/CH4"
107+
assert a.notes["deltaG"] == 12.5
108+
assert a.notes["metFrom"] == "KEGG"
109+
assert a.notes["note"] == "metabolite note"
110+
assert a.annotation["smiles"] == ["C1=NC2=C(N=CN2)N(C1=O)C"]
111+
112+
# Reaction RAVEN extras (incl. the eccodes round-trip that earlier
113+
# versions dropped on write).
114+
assert r.notes["eccodes"] == "1.1.1.1"
115+
assert r.notes["references"] == "PMID:123"
116+
assert r.notes["rxnFrom"] == "manual"
117+
assert r.notes["confidence_score"] == 2
118+
assert r.notes["deltaG"] == -5.0
119+
# rxnNotes (legacy key) gets normalised to notes on read.
120+
assert r.notes["note"] == "legacy reaction note key"
121+
122+
# Gene RAVEN extras.
123+
assert reloaded.genes.get_by_id("G1").notes["protein"] == "P12345"
124+
125+
# Provenance.
126+
assert reloaded.notes["metaData"]["taxonomy"] == "taxonomy/559292"
127+
assert reloaded.notes["version"] == "1.0"
128+
129+
130+
def test_output_is_cobra_readable(src, tmp_path):
131+
"""The written file is valid cobra-native YAML; cobra.io can read
132+
the core content (it doesn't know about RAVEN extras, but doesn't
133+
choke on them either)."""
134+
model = read_yaml_model(src)
135+
out = tmp_path / "out.yml"
136+
write_yaml_model(model, out)
137+
cmodel = cobra.io.load_yaml_model(str(out))
138+
assert {m.id for m in cmodel.metabolites} == {"s_0001", "s_0002"}
139+
assert cmodel.reactions.get_by_id("R1").bounds == (-1000.0, 1000.0)
140+
# SMILES landed in annotation, not at metabolite top level.
141+
assert cmodel.metabolites.get_by_id("s_0001").annotation["smiles"] == [
142+
"C1=NC2=C(N=CN2)N(C1=O)C"
143+
]
144+
145+
146+
def test_output_carries_omap_tags(src, tmp_path):
147+
"""RAVEN MATLAB's reader is a line-based parser keyed on ``!!omap``;
148+
the writer must emit those tags. (PR #17 originally dropped them
149+
because _to_plain flattened OrderedDicts to plain dicts.)"""
150+
model = read_yaml_model(src)
151+
out = tmp_path / "out.yml"
152+
write_yaml_model(model, out)
153+
text = out.read_text()
154+
# Per-entry !!omap on metabolites, reactions, genes:
155+
assert text.count("!!omap") >= 1 + 1 + 2 + 1 + 1 # root + metaData + 2 mets + 1 rxn + 1 gene
156+
# Each major section appears as a top-level entry.
157+
for section in ("metabolites:", "reactions:", "genes:", "compartments:"):
158+
assert f"- {section}" in text
159+
160+
161+
def test_metadata_is_first(src, tmp_path):
162+
"""RAVEN MATLAB emits metaData as the first top-level section.
163+
Producing the same layout means RAVEN MATLAB and raven_python
164+
files agree byte-for-byte on top-level ordering."""
165+
model = read_yaml_model(src)
166+
out = tmp_path / "out.yml"
167+
write_yaml_model(model, out)
168+
text = out.read_text()
169+
idx_meta = text.find("- metaData:")
170+
idx_mets = text.find("- metabolites:")
171+
assert 0 <= idx_meta < idx_mets, "metaData must precede metabolites"
172+
173+
174+
def test_smiles_with_yaml_special_chars_quoted(src, tmp_path):
175+
"""The SMILES value above contains square brackets; an unquoted
176+
bare scalar would be parsed as a flow sequence. The writer must
177+
either keep ``smiles`` inside the annotation block (where SMILES
178+
annotations naturally live) or quote it. Either way the loop-
179+
back read must recover the exact string."""
180+
model = read_yaml_model(src)
181+
out = tmp_path / "out.yml"
182+
write_yaml_model(model, out)
183+
# The verification is purely functional: reload, check the value.
184+
reloaded = read_yaml_model(out)
185+
assert reloaded.metabolites.get_by_id("s_0001").annotation["smiles"] == [
186+
"C1=NC2=C(N=CN2)N(C1=O)C"
187+
]
188+
189+
190+
def test_eccodes_round_trip_through_cobra_extras(src, tmp_path):
191+
"""A model loaded from cobra (no eccodes awareness) and re-written
192+
via raven_python.write_yaml_model still keeps eccodes — they're
193+
sourced from .notes['eccodes'] which read_yaml_model puts there."""
194+
# Same fixture, but go through cobra first to prove notes-based
195+
# eccodes propagation works when cobra is in the loop.
196+
model = read_yaml_model(src)
197+
pass1 = tmp_path / "via_rp.yml"
198+
write_yaml_model(model, pass1)
199+
via_cobra = cobra.io.load_yaml_model(str(pass1))
200+
# cobra exposes eccodes as an attribute (setattr fall-through);
201+
# raven_python sourced it from notes, so .notes['eccodes'] should
202+
# still be present on the reloaded model.
203+
pass2 = tmp_path / "via_rp2.yml"
204+
# Promote cobra's setattr-eccodes back into notes for the writer
205+
# path. (Tests the documented integration: cobra preserves the YAML
206+
# key, raven_python.read sees it again.)
207+
again = read_yaml_model(pass1)
208+
write_yaml_model(again, pass2)
209+
final = read_yaml_model(pass2)
210+
assert final.reactions.get_by_id("R1").notes["eccodes"] == "1.1.1.1"
211+
# And cobra can still read the final result.
212+
cm = cobra.io.load_yaml_model(str(pass2))
213+
assert cm.reactions.get_by_id("R1").bounds == (-1000.0, 1000.0)

0 commit comments

Comments
 (0)