Skip to content

Commit 70d0c59

Browse files
committed
Refactor ChEBI fetcher to use search API and update tests
Refactored pyenzyme.fetcher.chebi to use the ChEBI search API for fetching entries, simplifying data models and result processing. Updated function signatures and logic to work with the new API response structure. Adjusted integration tests to expect httpx.HTTPStatusError for invalid PubChem IDs instead of ValueError.
1 parent 9f8f0d2 commit 70d0c59

3 files changed

Lines changed: 68 additions & 113 deletions

File tree

pyenzyme/fetcher/chebi.py

Lines changed: 66 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
"""
77

88
import re
9-
from typing import Dict, List, Optional
9+
from typing import List, Optional
1010

1111
import httpx
12-
from pydantic import BaseModel, ConfigDict, RootModel
12+
from pydantic import BaseModel, ConfigDict, Field
1313

1414
from pyenzyme.versions import v2
1515

@@ -24,73 +24,61 @@ def __init__(self, message: str, cause: Optional[Exception] = None):
2424
self.cause = cause
2525

2626

27-
class ChEBIStructure(BaseModel):
28-
"""Chemical structure information."""
29-
30-
model_config = ConfigDict(extra="ignore")
31-
32-
smiles: Optional[str] = None
33-
standard_inchi: Optional[str] = None
34-
standard_inchi_key: Optional[str] = None
35-
36-
37-
class ChEBIEntryData(BaseModel):
38-
"""Core data structure for a ChEBI entry."""
27+
class ChebiSearchSource(BaseModel):
28+
"""Source data structure from ChEBI search API result."""
3929

4030
model_config = ConfigDict(extra="ignore")
4131

32+
chebi_accession: str
33+
name: Optional[str] = None
4234
ascii_name: str
43-
default_structure: Optional[ChEBIStructure] = None
44-
45-
46-
class ChEBIEntryResult(BaseModel):
47-
"""Individual ChEBI entry result."""
48-
49-
model_config = ConfigDict(extra="ignore")
50-
51-
standardized_chebi_id: str
52-
data: ChEBIEntryData
53-
54-
55-
class ChEBIApiResponse(RootModel[Dict[str, ChEBIEntryResult]]):
56-
"""Top-level response structure from ChEBI API. Maps ChEBI IDs to their corresponding entry data."""
57-
58-
root: Dict[str, ChEBIEntryResult]
35+
smiles: Optional[str] = None
36+
inchi: Optional[str] = None
37+
inchikey: Optional[str] = None
38+
definition: Optional[str] = None
39+
formula: Optional[str] = None
40+
charge: Optional[int] = None
41+
mass: Optional[float] = None
42+
monoisotopicmass: Optional[float] = None
43+
stars: Optional[int] = None
44+
default_structure: Optional[int] = None
45+
structures: Optional[List[int]] = None
5946

6047

6148
class ChebiSearchResult(BaseModel):
6249
"""Individual search result structure."""
6350

64-
model_config = ConfigDict(extra="ignore")
51+
model_config = ConfigDict(extra="ignore", populate_by_name=True)
6552

66-
_source: Dict[str, str] # Contains chebi_accession field
53+
source: ChebiSearchSource = Field(alias="_source")
6754

6855

6956
class ChebiSearchResponse(BaseModel):
7057
"""Search response structure from ChEBI search API."""
7158

7259
results: List[ChebiSearchResult]
60+
total: int
61+
number_pages: int
7362

7463

7564
class ChEBIClient:
7665
"""Client for accessing the ChEBI API to fetch chemical entity data."""
7766

78-
BASE_URL = "https://www.ebi.ac.uk/chebi/backend/api/public/compounds/"
7967
SEARCH_URL = "https://www.ebi.ac.uk/chebi/backend/api/public/es_search/"
8068

8169
def __init__(self):
8270
"""Initialize the ChEBI client."""
8371
pass
8472

85-
def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:
73+
def get_entry_by_id(self, chebi_id: str) -> ChebiSearchSource:
8674
"""
87-
Fetch a ChEBI entry by its ID.
75+
Fetch a ChEBI entry by its ID using the search API.
8876
8977
Args:
9078
chebi_id: The ChEBI ID to fetch, can be with or without the 'CHEBI:' prefix
9179
9280
Returns:
93-
ChEBIEntryResult object with the parsed response data
81+
ChebiSearchSource object with the parsed response data
9482
9583
Raises:
9684
ChEBIError: If the ChEBI ID is invalid or not found
@@ -101,21 +89,18 @@ def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:
10189

10290
try:
10391
with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
104-
url = self.BASE_URL.format(chebi_id)
105-
response = client.get(url)
92+
params = {"term": chebi_id, "page": "1", "size": "1"}
93+
response = client.get(self.SEARCH_URL, params=params)
10694
response.raise_for_status()
10795

10896
if response.status_code == 200:
10997
try:
110-
raw_response_data = response.json()
98+
search_response = ChebiSearchResponse(**response.json())
11199

112-
if not raw_response_data or len(raw_response_data) == 0:
100+
if not search_response.results or len(search_response.results) == 0:
113101
raise ChEBIError(f"No data found for ChEBI ID {chebi_id}")
114102

115-
chebi_response = ChEBIApiResponse(raw_response_data)
116-
117-
entry = list(chebi_response.root.values())[0]
118-
return entry
103+
return search_response.results[0].source
119104

120105
except Exception as e:
121106
if isinstance(e, ChEBIError):
@@ -127,15 +112,15 @@ def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:
127112
except httpx.HTTPStatusError as e:
128113
raise ChEBIError(f"Failed to fetch ChEBI ID {chebi_id}: {str(e)}", e)
129114

130-
def get_entries_batch(self, chebi_ids: List[str]) -> List[ChEBIEntryResult]:
115+
def get_entries_batch(self, chebi_ids: List[str]) -> List[ChebiSearchSource]:
131116
"""
132-
Fetch multiple ChEBI entries by their IDs.
117+
Fetch multiple ChEBI entries by their IDs using the search API.
133118
134119
Args:
135120
chebi_ids: List of ChEBI IDs to fetch
136121
137122
Returns:
138-
List of ChEBIEntryResult objects with data from ChEBI
123+
List of ChebiSearchSource objects with data from ChEBI
139124
140125
Raises:
141126
ChEBIError: If any ChEBI ID is invalid or not found
@@ -144,74 +129,51 @@ def get_entries_batch(self, chebi_ids: List[str]) -> List[ChEBIEntryResult]:
144129
if not chebi_ids:
145130
return []
146131

147-
formatted_ids = []
132+
results = []
148133
for chebi_id in chebi_ids:
149-
if not chebi_id.startswith("CHEBI:"):
150-
formatted_ids.append(f"CHEBI:{chebi_id}")
151-
else:
152-
formatted_ids.append(chebi_id)
153-
154-
try:
155-
with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
156-
url = self.BASE_URL.format(chebi_id)
157-
response = client.get(url)
158-
response.raise_for_status()
159-
160-
if response.status_code == 200:
161-
try:
162-
raw_response_data = response.json()
163-
chebi_response = ChEBIApiResponse(raw_response_data)
164-
return list(chebi_response.root.values())
165-
166-
except Exception as e:
167-
raise ChEBIError(
168-
f"Failed to parse ChEBI batch response: {str(e)}", e
169-
)
170-
else:
171-
raise ChEBIError(f"HTTP {response.status_code}: {response.text}")
134+
try:
135+
entry = self.get_entry_by_id(chebi_id)
136+
results.append(entry)
137+
except ChEBIError as e:
138+
# Fail fast: abort the whole batch on the first failure, naming the offending ID
139+
raise ChEBIError(f"Failed to fetch ChEBI ID {chebi_id}: {str(e)}", e)
172140

173-
except httpx.HTTPStatusError as e:
174-
raise ChEBIError(f"Failed to fetch ChEBI batch: {str(e)}", e)
141+
return results
175142

176143
def search_entries(
177-
self, query: str, size: Optional[int] = None
178-
) -> List[ChEBIEntryResult]:
144+
self, query: str, size: Optional[int] = None, page: int = 1
145+
) -> List[ChebiSearchSource]:
179146
"""
180147
Search for ChEBI entries by query string.
181148
182149
Args:
183150
query: The search query string to find ChEBI entries
184151
size: The maximum number of search results to return
152+
page: The page number to retrieve (default: 1)
185153
186154
Returns:
187-
List of ChEBIEntryResult objects for matching entries
155+
List of ChebiSearchSource objects for matching entries
188156
189157
Raises:
190158
ChEBIError: If the search request fails or the API is unavailable
191159
"""
192-
params = {"term": query}
160+
params = {"term": query, "page": str(page)}
193161
if size:
194162
params["size"] = str(size)
195163

196164
try:
197165
with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
198-
url = self.SEARCH_URL
199-
response = client.get(url, params=params)
166+
response = client.get(self.SEARCH_URL, params=params)
200167
response.raise_for_status()
201168

202169
if response.status_code == 200:
203170
try:
204-
search_results = ChebiSearchResponse(**response.json())
171+
search_response = ChebiSearchResponse(**response.json())
205172

206-
if not search_results.results:
173+
if not search_response.results:
207174
return []
208175

209-
chebi_ids = [
210-
result._source["chebi_accession"]
211-
for result in search_results.results
212-
]
213-
214-
return self.get_entries_batch(chebi_ids)
176+
return [result.source for result in search_response.results]
215177

216178
except Exception as e:
217179
if isinstance(e, ChEBIError):
@@ -226,44 +188,39 @@ def search_entries(
226188
raise ChEBIError(f"Failed to search ChEBI: {str(e)}", e)
227189

228190

229-
def process_chebi_entry(entry: ChEBIEntryResult) -> v2.SmallMolecule:
191+
def process_search_result(source: ChebiSearchSource) -> v2.SmallMolecule:
230192
"""
231-
Process a ChEBI entry result and convert it to a SmallMolecule object.
193+
Process a ChEBI search result source and convert it to a SmallMolecule object.
232194
233195
Args:
234-
entry: The ChEBI entry result from the API
196+
source: The ChEBI search result source from the API
235197
236198
Returns:
237199
A SmallMolecule object with mapped data
238200
"""
239-
smallmol_id = process_id(entry.data.ascii_name)
240-
241-
structure = entry.data.default_structure
242-
canonical_smiles = structure.smiles if structure else None
243-
inchi = structure.standard_inchi if structure else None
244-
inchikey = structure.standard_inchi_key if structure else None
201+
smallmol_id = process_id(source.ascii_name)
245202

246203
small_molecule = v2.SmallMolecule(
247204
id=smallmol_id,
248-
name=entry.data.ascii_name,
249-
canonical_smiles=canonical_smiles,
250-
inchi=inchi,
251-
inchikey=inchikey,
205+
name=source.ascii_name,
206+
canonical_smiles=source.smiles,
207+
inchi=source.inchi,
208+
inchikey=source.inchikey,
252209
constant=False,
253210
vessel_id=None,
254211
synonymous_names=[],
255212
references=[
256-
f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId={entry.standardized_chebi_id}"
213+
f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId={source.chebi_accession}"
257214
],
258215
)
259216

260217
small_molecule.add_type_term(
261-
term=f"OBO:{entry.standardized_chebi_id.replace(':', '_')}",
218+
term=f"OBO:{source.chebi_accession.replace(':', '_')}",
262219
prefix="OBO",
263220
iri="http://purl.obolibrary.org/obo/",
264221
)
265222

266-
small_molecule.ld_id = f"OBO:{entry.standardized_chebi_id.replace(':', '_')}"
223+
small_molecule.ld_id = f"OBO:{source.chebi_accession.replace(':', '_')}"
267224

268225
return small_molecule
269226

@@ -290,9 +247,9 @@ def fetch_chebi(
290247
"""
291248
try:
292249
client = ChEBIClient()
293-
chebi_entry = client.get_entry_by_id(chebi_id)
250+
chebi_source = client.get_entry_by_id(chebi_id)
294251

295-
small_molecule = process_chebi_entry(chebi_entry)
252+
small_molecule = process_search_result(chebi_source)
296253

297254
if smallmol_id is not None:
298255
small_molecule.id = smallmol_id
@@ -325,9 +282,9 @@ def fetch_chebi_batch(chebi_ids: List[str]) -> List[v2.SmallMolecule]:
325282
return []
326283

327284
client = ChEBIClient()
328-
chebi_entries = client.get_entries_batch(chebi_ids)
285+
chebi_sources = client.get_entries_batch(chebi_ids)
329286

330-
return [process_chebi_entry(entry) for entry in chebi_entries]
287+
return [process_search_result(source) for source in chebi_sources]
331288

332289

333290
def search_chebi(query: str, size: Optional[int] = None) -> List[v2.SmallMolecule]:
@@ -355,9 +312,9 @@ def search_chebi(query: str, size: Optional[int] = None) -> List[v2.SmallMolecul
355312
atp_results = search_chebi('ATP', 5)
356313
"""
357314
client = ChEBIClient()
358-
chebi_entries = client.search_entries(query, size)
315+
chebi_sources = client.search_entries(query, size)
359316

360-
return [process_chebi_entry(entry) for entry in chebi_entries]
317+
return [process_search_result(source) for source in chebi_sources]
361318

362319

363320
def process_id(name: str) -> str:

pyenzyme/fetcher/pubchem.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,6 @@ def from_cid(cid: int) -> PubChemQuery:
9898
response = client.get(url)
9999
response.raise_for_status()
100100

101-
if response.status_code != 200:
102-
raise ValueError(f"Failed to fetch PubChem data for CID {cid}")
103-
104101
return PubChemQuery(**response.json())
105102

106103
@staticmethod

tests/integration/test_fetcher.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import httpx
12
import pytest
23

34
from pyenzyme.fetcher.chebi import fetch_chebi
@@ -155,7 +156,7 @@ def test_fetch_pubchem_to_small_molecule_with_prefix(self):
155156

156157
@pytest.mark.remote
157158
def test_fetch_pubchem_to_small_molecule_invalid_id(self):
158-
with pytest.raises(ValueError):
159+
with pytest.raises(httpx.HTTPStatusError):
159160
fetch_pubchem(cid="162176127617627")
160161

161162
@pytest.mark.remote

0 commit comments

Comments
 (0)