Skip to content

Commit 39adaf6

Browse files
committed
feat(io.ec_data): add EcData.validate + EcData.empty
Two shape-management helpers that consumers (geckopy's pipeline, test fixtures) need on top of the raw dataclass:

- validate(): raise ValueError when per-rxn array lengths, per-enzyme array lengths, or the rxn_enz_mat shape drift from one another. Cheap; callable after each mutation in a builder pipeline.
- EcData.empty(n_rxns, n_enzymes=0, *, gecko_light=False): preallocate with the canonical sentinels (empty strings for the string fields, 0 for kcat, NaN for mw/concs, empty CSR matrix). Used by builders that allocate up-front and fill row by row.

Both methods are shape-level operations, not algorithms, so they live with the dataclass rather than on a downstream consumer.

Tests: 6 new EcData tests covering empty's sentinels, validate's three drift paths (per-rxn length, per-enzyme length, coupling-matrix shape), the empty -> validate round-trip, and the gecko_light flag on empty.
1 parent cbe5f59 commit 39adaf6

2 files changed

Lines changed: 121 additions & 0 deletions

File tree

src/raven_python/io/ec_data.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,80 @@ def n_rxns(self) -> int:
107107
def n_enzymes(self) -> int:
108108
return len(self.enzymes)
109109

110+
def validate(self) -> None:
    """Verify that all fields agree on the reaction and enzyme counts.

    Three groups are checked in order, stopping at the first mismatch:

    1. ``kcat``, ``source``, ``notes`` and ``eccodes`` must each have
       ``n_rxns`` entries.
    2. ``genes``, ``mw``, ``sequence`` and ``concs`` must each have
       ``n_enzymes`` entries.
    3. ``rxn_enz_mat.shape`` must equal ``(n_rxns, n_enzymes)``.

    Only lengths and the matrix shape are inspected, never the values,
    so the check is cheap enough to run after every mutation.

    Raises:
        ValueError: naming the first offending field together with its
            actual and expected size.
    """
    expected_rxns = self.n_rxns
    expected_enzymes = self.n_enzymes

    # Measure the whole per-rxn group up front, then compare each member.
    per_rxn = {
        field: len(getattr(self, field))
        for field in ("kcat", "source", "notes", "eccodes")
    }
    for field, actual in per_rxn.items():
        if actual == expected_rxns:
            continue
        raise ValueError(
            f"ec.{field} has length {actual}, expected {expected_rxns} "
            f"(matching ec.rxns)"
        )

    # `enzymes` defines the enzyme count, so only its siblings are
    # measured against it.
    per_enzyme = {
        field: len(getattr(self, field))
        for field in ("genes", "mw", "sequence", "concs")
    }
    for field, actual in per_enzyme.items():
        if actual == expected_enzymes:
            continue
        raise ValueError(
            f"ec.{field} has length {actual}, expected {expected_enzymes} "
            f"(matching ec.enzymes)"
        )

    actual_shape = self.rxn_enz_mat.shape
    if actual_shape != (expected_rxns, expected_enzymes):
        raise ValueError(
            f"ec.rxn_enz_mat has shape {actual_shape}, "
            f"expected ({expected_rxns}, {expected_enzymes})"
        )
153+
154+
@classmethod
def empty(cls, n_rxns: int, n_enzymes: int = 0, *,
          gecko_light: bool = False) -> "EcData":
    """Preallocate an instance filled with the canonical sentinel values.

    Every string field (``rxns``, ``source``, ``notes``, ``eccodes``,
    ``genes``, ``enzymes``, ``sequence``) is a list of empty strings.
    ``kcat`` starts at 0 (0 marks "no kcat assigned"). ``mw`` and
    ``concs`` start at NaN, since their physical default is "unknown"
    rather than zero. ``rxn_enz_mat`` is an all-zero CSR matrix of shape
    ``(n_rxns, n_enzymes)``.

    Used by makeEcModel-style builders that allocate the structure
    up-front, then fill it row by row.

    Declared as a classmethod so that calling it on a subclass builds an
    instance of that subclass; ``EcData.empty(...)`` behaves as before.

    Args:
        n_rxns: Number of reaction slots to allocate.
        n_enzymes: Number of enzyme slots to allocate.
        gecko_light: Stored unchanged on the new instance.

    Raises:
        ValueError: if ``n_rxns`` or ``n_enzymes`` is negative.
    """
    # Fail before allocating anything: list repetition silently accepts a
    # negative count while numpy rejects it with a less specific message.
    if n_rxns < 0 or n_enzymes < 0:
        raise ValueError(
            f"n_rxns and n_enzymes must be non-negative, "
            f"got n_rxns={n_rxns}, n_enzymes={n_enzymes}"
        )
    return cls(
        gecko_light=gecko_light,
        rxns=[""] * n_rxns,
        kcat=np.zeros(n_rxns, dtype=float),
        source=[""] * n_rxns,
        notes=[""] * n_rxns,
        eccodes=[""] * n_rxns,
        genes=[""] * n_enzymes,
        enzymes=[""] * n_enzymes,
        mw=np.full(n_enzymes, np.nan, dtype=float),
        sequence=[""] * n_enzymes,
        concs=np.full(n_enzymes, np.nan, dtype=float),
        # A shape tuple builds an empty CSR matrix directly; no LIL detour.
        rxn_enz_mat=sparse.csr_matrix((n_rxns, n_enzymes), dtype=float),
    )
183+
110184

111185
# --------------------------------------------------------------------------- #
112186
# Load

tests/test_io_yaml_ec_data.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,3 +365,50 @@ def test_model_from_yaml_data_mutates_in_place():
365365
assert "ec-rxns" not in doc
366366
assert "ec-enzymes" not in doc
367367
assert "gecko_light" not in doc
368+
369+
370+
# --------------------------------------------------------------------------- #
371+
# EcData.validate / EcData.empty
372+
# --------------------------------------------------------------------------- #
373+
374+
def test_empty_has_canonical_sentinels():
    """`EcData.empty(n, m)` preallocates every field with its sentinel."""
    ec = EcData.empty(3, 2)
    assert ec.n_rxns == 3
    assert ec.n_enzymes == 2
    assert ec.gecko_light is False

    # Every string field starts as empty strings, not only `rxns`.
    assert ec.rxns == ["", "", ""]
    for field in ("source", "notes", "eccodes"):
        assert list(getattr(ec, field)) == [""] * 3, field
    for field in ("genes", "enzymes", "sequence"):
        assert list(getattr(ec, field)) == [""] * 2, field

    # Pin the array lengths too: `.all()` is vacuously true on an empty
    # array, so the value checks alone would not catch a zero-length field.
    assert ec.kcat.shape == (3,)
    assert (ec.kcat == 0).all()
    assert ec.mw.shape == (2,)
    assert np.isnan(ec.mw).all()
    assert ec.concs.shape == (2,)
    assert np.isnan(ec.concs).all()

    assert ec.rxn_enz_mat.shape == (3, 2)
    assert ec.rxn_enz_mat.nnz == 0
385+
386+
387+
def test_empty_round_trips_through_validate():
    """A freshly preallocated ``EcData`` passes its own consistency check."""
    preallocated = EcData.empty(5, 4)
    preallocated.validate()  # must not raise
389+
390+
391+
def test_validate_catches_per_rxn_length_drift():
    """A per-rxn array whose length differs from ``rxns`` is reported by name."""
    ec = EcData.empty(3, 2)
    ec.kcat = np.array([1.0, 2.0])  # length 2, should be 3
    # `match` is a regular expression, so the dot is escaped to match the
    # literal "ec.kcat" rather than any character in that position.
    with pytest.raises(ValueError, match=r"ec\.kcat has length 2, expected 3"):
        ec.validate()
396+
397+
398+
def test_validate_catches_per_enzyme_length_drift():
    """A per-enzyme array whose length differs from ``enzymes`` is reported."""
    ec = EcData.empty(3, 2)
    ec.mw = np.array([1.0])  # length 1, should be 2
    # `match` is a regular expression, so the dot is escaped to match the
    # literal "ec.mw" rather than any character in that position.
    with pytest.raises(ValueError, match=r"ec\.mw has length 1, expected 2"):
        ec.validate()
403+
404+
405+
def test_validate_catches_coupling_matrix_shape_drift():
    """A coupling matrix of the wrong shape is reported with both shapes."""
    ec = EcData.empty(3, 2)
    ec.rxn_enz_mat = sparse.csr_matrix((3, 5), dtype=float)
    # `match` is a regular expression: escape the dot and the parentheses,
    # and pin the expected shape as well as the offending one.
    with pytest.raises(
        ValueError,
        match=r"ec\.rxn_enz_mat has shape \(3, 5\), expected \(3, 2\)",
    ):
        ec.validate()
410+
411+
412+
def test_empty_gecko_light_flag_propagates():
    """``gecko_light`` is forwarded when given and defaults to ``False``."""
    flagged = EcData.empty(1, 1, gecko_light=True)
    assert flagged.gecko_light is True

    unflagged = EcData.empty(1, 1)
    assert unflagged.gecko_light is False

0 commit comments

Comments
 (0)