# Source: PyEnzyme/pyenzyme/thinlayers/base.py at 7095705de6ffdd1df0485d9ca1664863b02d0192 · EnzymeML/PyEnzyme · GitHub
from abc import ABC, abstractmethod
from functools import cached_property
from typing import Dict, List, Optional, Set, Tuple, TypeAlias

import pandas as pd
from sympy import Symbol, sympify

import pyenzyme as pe
from pyenzyme.versions import v2

# Type aliases shared across the thin layers.
# They give the recurring container types a descriptive name, which reads more
# easily than repeating the full generic type hints in every signature.

# Maps species IDs to initial concentrations (see `BaseThinLayer.integrate`).
InitCondDict: TypeAlias = Dict[str, float]
# Maps species IDs to their simulated concentration trajectories.
SimResult: TypeAlias = Dict[str, List[float]]
# Time points belonging to a simulation result.
Time: TypeAlias = List[float]


class BaseThinLayer(ABC):
    """
    Base class for thin layers that wrap EnzymeML documents.

    This class provides a foundation for creating specialized interfaces to EnzymeML documents,
    with built-in conversion to SBML and pandas DataFrames. It allows filtering measurements
    by their IDs and, optionally, dropping species that are not part of the model.

    Subclasses must implement ``integrate``, ``optimize`` and ``write``.

    Attributes:
        enzmldoc (v2.EnzymeMLDocument): Private deep copy of the wrapped document with
            measurements that carry no species data removed.
        fitted_doc (v2.EnzymeMLDocument): Second, independent deep copy of the same
            cleaned document. The base class only creates it.
        measurement_ids (List[str]): IDs of the measurements to include. When no IDs are
            passed to the constructor, the IDs of all non-empty measurements are used.
        df_per_measurement (bool): Flag stored on the instance; it is not read by the
            base class itself.
        exclude_unmodeled_species (bool): If True, ``df`` is built from a document from
            which unmodeled species have been removed.

    Note:
        ``sbml_xml``, ``df`` and ``df_map`` are cached properties: they are computed on
        first access and not refreshed if the instance is modified afterwards.
    """

    enzmldoc: v2.EnzymeMLDocument
    fitted_doc: v2.EnzymeMLDocument
    measurement_ids: List[str]
    df_per_measurement: bool
    exclude_unmodeled_species: bool = True

    def __init__(
        self,
        enzmldoc: v2.EnzymeMLDocument,
        measurement_ids: Optional[List[str]] = None,
        df_per_measurement: bool = False,
        exclude_unmodeled_species: bool = True,
    ):
        """
        Wraps a copy of the given EnzymeML document.

        The passed document is never modified: it is deep-copied first and empty
        measurements are removed from the copy only.

        Args:
            enzmldoc (v2.EnzymeMLDocument): The EnzymeML document to wrap.
            measurement_ids (Optional[List[str]]): Measurement IDs to restrict the data to.
                If None, all measurements that contain species data are used.
            df_per_measurement (bool): Stored as ``self.df_per_measurement``.
            exclude_unmodeled_species (bool): Whether ``df`` should drop species that
                are not part of the model. Defaults to True.

        Raises:
            AssertionError: If ``enzmldoc`` is not a ``v2.EnzymeMLDocument`` or
                ``measurement_ids`` is neither a list nor None.
        """
        assert isinstance(enzmldoc, v2.EnzymeMLDocument), (
            "enzmldoc must be a v2.EnzymeMLDocument"
        )
        assert measurement_ids is None or isinstance(measurement_ids, list), (
            "measurement_ids must be a list of measurement IDs or None"
        )

        # Copy BEFORE cleaning so the caller's document keeps all its measurements.
        document = enzmldoc.model_copy(deep=True)

        # Remove empty measurements (on the private copy only)
        document.measurements = [
            meas for meas in document.measurements if meas.species_data
        ]

        if measurement_ids is None:
            measurement_ids = [meas.id for meas in document.measurements]

        self.enzmldoc = document
        self.fitted_doc = document.model_copy(deep=True)
        self.measurement_ids = measurement_ids
        self.df_per_measurement = df_per_measurement
        self.exclude_unmodeled_species = exclude_unmodeled_species

    @staticmethod
    def _remove_unmodeled_species(enzmldoc: v2.EnzymeMLDocument) -> v2.EnzymeMLDocument:
        """
        Removes species that are not modeled from the EnzymeML document.

        A species counts as modeled if it is a reactant or product of a reaction, if it
        is the target of an ODE, or if its ID appears as a symbol in a kinetic law, an
        ODE, an assignment or an initial assignment. Measurements that have no species
        data left after filtering are removed as well.

        Args:
            enzmldoc (v2.EnzymeMLDocument): The EnzymeML document to filter.

        Returns:
            v2.EnzymeMLDocument: A deep copy of the document with unmodeled species removed.

        Note:
            - Creates a deep copy to avoid modifying the original document
            - Removes measurements that become empty after species filtering
            - If no species is modeled at all, measurements, small molecules, proteins
              and complexes are all emptied
        """
        enzmldoc = enzmldoc.model_copy(deep=True)

        all_species = BaseThinLayer._get_all_species(enzmldoc)

        # Parse every species ID as a plain symbol. Without this mapping, IDs such as
        # "E", "S", "I" or "N" would resolve to SymPy built-ins instead of symbols and
        # could either be missed or make parsing fail.
        species_symbols = {
            species_id: Symbol(species_id) for species_id in all_species
        }

        def parse(expression):
            return sympify(expression, locals=species_symbols)

        # Collect all species that are explicitly modeled
        modeled_species = set()
        expressions = []

        # Add species from reactions (reactants and products)
        for reaction in enzmldoc.reactions:
            modeled_species.update(
                reactant.species_id for reactant in reaction.reactants
            )
            modeled_species.update(product.species_id for product in reaction.products)

            if reaction.kinetic_law:
                expressions.append(parse(reaction.kinetic_law.equation))

        # ODEs additionally mark their target species as modeled; assignment rules
        # only contribute the symbols of their right-hand side.
        rule_types = (
            v2.EquationType.ASSIGNMENT,
            v2.EquationType.INITIAL_ASSIGNMENT,
        )
        for equation in enzmldoc.equations:
            if equation.equation_type == v2.EquationType.ODE:
                expressions.append(parse(equation.equation))
                modeled_species.add(equation.species_id)
            elif equation.equation_type in rule_types:
                expressions.append(parse(equation.equation))

        # Find species referenced in equations
        equation_symbols = {
            symbol for expr in expressions for symbol in expr.free_symbols
        }
        modeled_species.update(
            species_id
            for species_id, symbol in species_symbols.items()
            if symbol in equation_symbols
        )

        if not modeled_species:
            enzmldoc.measurements = []
            enzmldoc.small_molecules = []
            enzmldoc.proteins = []
            enzmldoc.complexes = []
            return enzmldoc

        filtered_measurements = []
        for measurement in enzmldoc.measurements:
            # Filter species data to only include modeled species
            kept_data = [
                data
                for data in measurement.species_data
                if data.species_id in modeled_species
            ]

            # Only keep measurements that still have species data
            if kept_data:
                measurement.species_data = kept_data
                filtered_measurements.append(measurement)

        # Update all collections to only include modeled species
        enzmldoc.measurements = filtered_measurements
        enzmldoc.small_molecules = [
            molecule
            for molecule in enzmldoc.small_molecules
            if molecule.id in modeled_species
        ]
        enzmldoc.proteins = [
            protein for protein in enzmldoc.proteins if protein.id in modeled_species
        ]
        enzmldoc.complexes = [
            cplx for cplx in enzmldoc.complexes if cplx.id in modeled_species
        ]

        return enzmldoc

    @staticmethod
    def _get_all_species(enzmldoc: v2.EnzymeMLDocument) -> Set[str]:
        """
        Gets the IDs of all species from the EnzymeML document.

        Args:
            enzmldoc (v2.EnzymeMLDocument): The EnzymeML document to inspect.

        Returns:
            Set[str]: IDs of all small molecules, proteins and complexes.
        """
        return {
            species.id
            for species in enzmldoc.small_molecules
            + enzmldoc.proteins
            + enzmldoc.complexes
        }

    @abstractmethod
    def integrate(
        self,
        model: v2.EnzymeMLDocument,
        initial_conditions: InitCondDict,
        t0: float,
        t1: float,
        nsteps: int = 100,
    ) -> Tuple[SimResult, Time]:
        """
        Integrates the model from t0 to t1 with the given initial conditions.

        Args:
            model (v2.EnzymeMLDocument): EnzymeML document containing the model.
            initial_conditions (InitCondDict): Dictionary mapping species IDs to initial concentrations.
            t0 (float): Start time for integration.
            t1 (float): End time for integration.
            nsteps (int, optional): Number of time points to generate. Defaults to 100.

        Returns:
            Tuple[SimResult, Time]: A tuple containing:
                - Dict mapping species IDs to concentration trajectories.
                - List of time points.

        Examples:
            >>> # Simulate model with initial conditions
            >>> species_data, time_points = thinlayer.integrate(
            ...     model=doc,
            ...     initial_conditions={"S1": 10.0, "S2": 0.0},
            ...     t0=0.0,
            ...     t1=100.0,
            ...     nsteps=200
            ... )
        """

    @abstractmethod
    def optimize(self, **kwargs):
        """
        Optimizes the model parameters.

        This method should implement parameter optimization for the model,
        typically fitting to experimental data contained in the EnzymeML document.

        Args:
            **kwargs: Implementation-specific keyword arguments.

        Returns:
            Implementation-specific optimization results.

        Examples:
            >>> # Optimize model parameters
            >>> result = thinlayer.optimize(**kwargs)
            >>> print(f"Optimization success: {result.success}")
        """

    @abstractmethod
    def write(self) -> v2.EnzymeMLDocument:
        """
        Writes the optimized model parameters to a copy of the EnzymeMLDocument.

        This method creates a new EnzymeML document with updated parameter values
        based on optimization results.

        Returns:
            v2.EnzymeMLDocument: A new EnzymeML document with optimized parameters.

        Examples:
            >>> # Get optimized document after parameter fitting
            >>> thinlayer.optimize()
            >>> optimized_doc = thinlayer.write()
            >>> pe.write_enzymeml(optimized_doc, "optimized_model.json")
        """

    @staticmethod
    def _check_measurement_ids(enzmldoc: v2.EnzymeMLDocument) -> v2.EnzymeMLDocument:
        """
        Validates that the EnzymeML document contains at least one measurement.

        Args:
            enzmldoc (v2.EnzymeMLDocument): The EnzymeML document to validate.

        Returns:
            v2.EnzymeMLDocument: The same document, unchanged, if it has measurements.

        Raises:
            ValueError: If the EnzymeML document has no measurements.

        Examples:
            >>> # Validate document has measurements
            >>> validated_doc = BaseThinLayer._check_measurement_ids(doc)
        """
        if not enzmldoc.measurements:
            raise ValueError("EnzymeMLDocument has no measurements")

        return enzmldoc

    @cached_property
    def sbml_xml(self) -> str:
        """
        Converts the EnzymeML document to SBML XML format.

        The value is the first element of what ``pe.to_sbml`` returns for
        ``self.enzmldoc`` and is cached after the first access.

        Returns:
            str: The SBML XML representation of the EnzymeML document.

        Examples:
            >>> # Export model as SBML
            >>> sbml_string = thinlayer.sbml_xml
            >>> with open("model.xml", "w") as f:
            ...     f.write(sbml_string)
        """
        return pe.to_sbml(self.enzmldoc)[0]

    @cached_property
    def df(self) -> pd.DataFrame:
        """
        Converts the EnzymeML document to a single pandas DataFrame.

        If ``exclude_unmodeled_species`` is set, the DataFrame is built from a copy of
        the document from which unmodeled species were removed. If ``measurement_ids``
        is not None, only rows whose "id" is one of those IDs are kept. The result is
        cached after the first access.

        Returns:
            pd.DataFrame: A DataFrame containing time series data for all measurements
                or only the specified measurements.

        Raises:
            ValueError: If the conversion doesn't return a DataFrame.
        """
        if self.exclude_unmodeled_species:
            enzmldoc = self._remove_unmodeled_species(self.enzmldoc)
        else:
            enzmldoc = self.enzmldoc

        df = pe.to_pandas(enzmldoc, per_measurement=False)

        # Validate before indexing so an unexpected return type surfaces as the
        # documented ValueError rather than as an indexing error.
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Expected a single dataframe")

        # Keep only the rows whose "id" is within measurement_ids
        if self.measurement_ids is not None:
            df = df[df["id"].isin(self.measurement_ids)]  # type: ignore

        return df

    @cached_property
    def df_map(self) -> dict[str, pd.DataFrame]:
        """
        Converts the EnzymeML document to pandas DataFrames, organized by measurement ID.

        If measurement_ids is specified, only those measurements are included in the result.
        The result is cached after the first access.

        Returns:
            dict[str, pd.DataFrame]: A dictionary mapping measurement IDs to their corresponding
                pandas DataFrames containing time series data.

        Raises:
            ValueError: If the conversion doesn't return a dictionary or if specified
                measurement IDs are not found in the document.
        """
        # NOTE(review): unlike `df`, this always converts `self.enzmldoc` directly and
        # does not apply `exclude_unmodeled_species` - confirm this is intended.
        df_map = pe.to_pandas(self.enzmldoc, per_measurement=True)

        if not isinstance(df_map, dict):
            raise ValueError("Expected a dictionary of dataframes")

        if self.measurement_ids is None:
            return df_map

        missing_ids = set(self.measurement_ids) - set(df_map.keys())
        if missing_ids:
            raise ValueError(f"Measurement ids {missing_ids} not found in data")

        return {k: v for k, v in df_map.items() if k in self.measurement_ids}