Refactor ChEBI fetcher to use search API and update tests

JR-1991 · JR-1991 · commit 70d0c59ce5ac · 2026-01-15T01:11:38.000+01:00
Refactored pyenzyme.fetcher.chebi to use the ChEBI search API for fetching entries, simplifying data models and result processing. Updated function signatures and logic to work with the new API response structure. Adjusted integration tests to expect httpx.HTTPStatusError for invalid PubChem IDs instead of ValueError.
diff --git a/pyenzyme/fetcher/chebi.py b/pyenzyme/fetcher/chebi.py
@@ -6,10 +6,10 @@
 """
 
 import re
-from typing import Dict, List, Optional
+from typing import List, Optional
 
 import httpx
-from pydantic import BaseModel, ConfigDict, RootModel
+from pydantic import BaseModel, ConfigDict, Field
 
 from pyenzyme.versions import v2
 
@@ -24,73 +24,61 @@ def __init__(self, message: str, cause: Optional[Exception] = None):
         self.cause = cause
 
 
-class ChEBIStructure(BaseModel):
-    """Chemical structure information."""
-
-    model_config = ConfigDict(extra="ignore")
-
-    smiles: Optional[str] = None
-    standard_inchi: Optional[str] = None
-    standard_inchi_key: Optional[str] = None
-
-
-class ChEBIEntryData(BaseModel):
-    """Core data structure for a ChEBI entry."""
+class ChebiSearchSource(BaseModel):
+    """Source data structure from ChEBI search API result."""
 
     model_config = ConfigDict(extra="ignore")
 
+    chebi_accession: str
+    name: Optional[str] = None
     ascii_name: str
-    default_structure: Optional[ChEBIStructure] = None
-
-
-class ChEBIEntryResult(BaseModel):
-    """Individual ChEBI entry result."""
-
-    model_config = ConfigDict(extra="ignore")
-
-    standardized_chebi_id: str
-    data: ChEBIEntryData
-
-
-class ChEBIApiResponse(RootModel[Dict[str, ChEBIEntryResult]]):
-    """Top-level response structure from ChEBI API. Maps ChEBI IDs to their corresponding entry data."""
-
-    root: Dict[str, ChEBIEntryResult]
+    smiles: Optional[str] = None
+    inchi: Optional[str] = None
+    inchikey: Optional[str] = None
+    definition: Optional[str] = None
+    formula: Optional[str] = None
+    charge: Optional[int] = None
+    mass: Optional[float] = None
+    monoisotopicmass: Optional[float] = None
+    stars: Optional[int] = None
+    default_structure: Optional[int] = None
+    structures: Optional[List[int]] = None
 
 
 class ChebiSearchResult(BaseModel):
     """Individual search result structure."""
 
-    model_config = ConfigDict(extra="ignore")
+    model_config = ConfigDict(extra="ignore", populate_by_name=True)
 
-    _source: Dict[str, str]  # Contains chebi_accession field
+    source: ChebiSearchSource = Field(alias="_source")
 
 
 class ChebiSearchResponse(BaseModel):
     """Search response structure from ChEBI search API."""
 
     results: List[ChebiSearchResult]
+    total: int
+    number_pages: int
 
 
 class ChEBIClient:
     """Client for accessing the ChEBI API to fetch chemical entity data."""
 
-    BASE_URL = "https://www.ebi.ac.uk/chebi/backend/api/public/compounds/"
     SEARCH_URL = "https://www.ebi.ac.uk/chebi/backend/api/public/es_search/"
 
     def __init__(self):
         """Initialize the ChEBI client."""
         pass
 
-    def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:
+    def get_entry_by_id(self, chebi_id: str) -> ChebiSearchSource:
         """
-        Fetch a ChEBI entry by its ID.
+        Fetch a ChEBI entry by its ID using the search API.
 
         Args:
             chebi_id: The ChEBI ID to fetch, can be with or without the 'CHEBI:' prefix
 
         Returns:
-            ChEBIEntryResult object with the parsed response data
+            ChebiSearchSource object with the parsed response data
 
         Raises:
             ChEBIError: If the ChEBI ID is invalid or not found
@@ -101,21 +89,18 @@ def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:
 
         try:
             with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
-                url = self.BASE_URL.format(chebi_id)
-                response = client.get(url)
+                params = {"term": chebi_id, "page": "1", "size": "1"}
+                response = client.get(self.SEARCH_URL, params=params)
                 response.raise_for_status()
 
             if response.status_code == 200:
                 try:
-                    raw_response_data = response.json()
+                    search_response = ChebiSearchResponse(**response.json())
 
-                    if not raw_response_data or len(raw_response_data) == 0:
+                    if not search_response.results or len(search_response.results) == 0:
                         raise ChEBIError(f"No data found for ChEBI ID {chebi_id}")
 
-                    chebi_response = ChEBIApiResponse(raw_response_data)
-
-                    entry = list(chebi_response.root.values())[0]
-                    return entry
+                    return search_response.results[0].source
 
                 except Exception as e:
                     if isinstance(e, ChEBIError):
@@ -127,15 +112,15 @@ def get_entry_by_id(self, chebi_id: str) -> ChEBIEntryResult:
         except httpx.HTTPStatusError as e:
             raise ChEBIError(f"Failed to fetch ChEBI ID {chebi_id}: {str(e)}", e)
 
-    def get_entries_batch(self, chebi_ids: List[str]) -> List[ChEBIEntryResult]:
+    def get_entries_batch(self, chebi_ids: List[str]) -> List[ChebiSearchSource]:
         """
-        Fetch multiple ChEBI entries by their IDs.
+        Fetch multiple ChEBI entries by their IDs using the search API.
 
         Args:
             chebi_ids: List of ChEBI IDs to fetch
 
         Returns:
-            List of ChEBIEntryResult objects with data from ChEBI
+            List of ChebiSearchSource objects with data from ChEBI
 
         Raises:
             ChEBIError: If any ChEBI ID is invalid or not found
@@ -144,74 +129,51 @@ def get_entries_batch(self, chebi_ids: List[str]) -> List[ChEBIEntryResult]:
         if not chebi_ids:
             return []
 
-        formatted_ids = []
+        results = []
         for chebi_id in chebi_ids:
-            if not chebi_id.startswith("CHEBI:"):
-                formatted_ids.append(f"CHEBI:{chebi_id}")
-            else:
-                formatted_ids.append(chebi_id)
-
-        try:
-            with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
-                url = self.BASE_URL.format(chebi_id)
-                response = client.get(url)
-                response.raise_for_status()
-
-            if response.status_code == 200:
-                try:
-                    raw_response_data = response.json()
-                    chebi_response = ChEBIApiResponse(raw_response_data)
-                    return list(chebi_response.root.values())
-
-                except Exception as e:
-                    raise ChEBIError(
-                        f"Failed to parse ChEBI batch response: {str(e)}", e
-                    )
-            else:
-                raise ChEBIError(f"HTTP {response.status_code}: {response.text}")
+            try:
+                entry = self.get_entry_by_id(chebi_id)
+                results.append(entry)
+            except ChEBIError as e:
+                # Continue with other IDs even if one fails
+                raise ChEBIError(f"Failed to fetch ChEBI ID {chebi_id}: {str(e)}", e)
 
-        except httpx.HTTPStatusError as e:
-            raise ChEBIError(f"Failed to fetch ChEBI batch: {str(e)}", e)
+        return results
 
     def search_entries(
-        self, query: str, size: Optional[int] = None
-    ) -> List[ChEBIEntryResult]:
+        self, query: str, size: Optional[int] = None, page: int = 1
+    ) -> List[ChebiSearchSource]:
         """
         Search for ChEBI entries by query string.
 
         Args:
             query: The search query string to find ChEBI entries
             size: The maximum number of search results to return
+            page: The page number to retrieve (default: 1)
 
         Returns:
-            List of ChEBIEntryResult objects for matching entries
+            List of ChebiSearchSource objects for matching entries
 
         Raises:
             ChEBIError: If the search request fails or the API is unavailable
         """
-        params = {"term": query}
+        params = {"term": query, "page": str(page)}
         if size:
             params["size"] = str(size)
 
         try:
             with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
-                url = self.SEARCH_URL
-                response = client.get(url, params=params)
+                response = client.get(self.SEARCH_URL, params=params)
             response.raise_for_status()
 
             if response.status_code == 200:
                 try:
-                    search_results = ChebiSearchResponse(**response.json())
+                    search_response = ChebiSearchResponse(**response.json())
 
-                    if not search_results.results:
+                    if not search_response.results:
                         return []
 
-                    chebi_ids = [
-                        result._source["chebi_accession"]
-                        for result in search_results.results
-                    ]
-
-                    return self.get_entries_batch(chebi_ids)
+                    return [result.source for result in search_response.results]
 
                 except Exception as e:
                     if isinstance(e, ChEBIError):
@@ -226,44 +188,39 @@ def search_entries(
             raise ChEBIError(f"Failed to search ChEBI: {str(e)}", e)
 
 
-def process_chebi_entry(entry: ChEBIEntryResult) -> v2.SmallMolecule:
+def process_search_result(source: ChebiSearchSource) -> v2.SmallMolecule:
     """
-    Process a ChEBI entry result and convert it to a SmallMolecule object.
+    Process a ChEBI search result source and convert it to a SmallMolecule object.
 
     Args:
-        entry: The ChEBI entry result from the API
+        source: The ChEBI search result source from the API
 
     Returns:
         A SmallMolecule object with mapped data
     """
-    smallmol_id = process_id(entry.data.ascii_name)
-
-    structure = entry.data.default_structure
-    canonical_smiles = structure.smiles if structure else None
-    inchi = structure.standard_inchi if structure else None
-    inchikey = structure.standard_inchi_key if structure else None
+    smallmol_id = process_id(source.ascii_name)
 
     small_molecule = v2.SmallMolecule(
         id=smallmol_id,
-        name=entry.data.ascii_name,
-        canonical_smiles=canonical_smiles,
-        inchi=inchi,
-        inchikey=inchikey,
+        name=source.ascii_name,
+        canonical_smiles=source.smiles,
+        inchi=source.inchi,
+        inchikey=source.inchikey,
         constant=False,
         vessel_id=None,
         synonymous_names=[],
         references=[
-            f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId={entry.standardized_chebi_id}"
+            f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId={source.chebi_accession}"
         ],
     )
 
     small_molecule.add_type_term(
-        term=f"OBO:{entry.standardized_chebi_id.replace(':', '_')}",
+        term=f"OBO:{source.chebi_accession.replace(':', '_')}",
         prefix="OBO",
         iri="http://purl.obolibrary.org/obo/",
     )
 
-    small_molecule.ld_id = f"OBO:{entry.standardized_chebi_id.replace(':', '_')}"
+    small_molecule.ld_id = f"OBO:{source.chebi_accession.replace(':', '_')}"
 
     return small_molecule
 
@@ -290,9 +247,9 @@ def fetch_chebi(
     """
     try:
         client = ChEBIClient()
-        chebi_entry = client.get_entry_by_id(chebi_id)
+        chebi_source = client.get_entry_by_id(chebi_id)
 
-        small_molecule = process_chebi_entry(chebi_entry)
+        small_molecule = process_search_result(chebi_source)
 
         if smallmol_id is not None:
             small_molecule.id = smallmol_id
@@ -325,9 +282,9 @@ def fetch_chebi_batch(chebi_ids: List[str]) -> List[v2.SmallMolecule]:
         return []
 
     client = ChEBIClient()
-    chebi_entries = client.get_entries_batch(chebi_ids)
+    chebi_sources = client.get_entries_batch(chebi_ids)
 
-    return [process_chebi_entry(entry) for entry in chebi_entries]
+    return [process_search_result(source) for source in chebi_sources]
 
 
 def search_chebi(query: str, size: Optional[int] = None) -> List[v2.SmallMolecule]:
@@ -355,9 +312,9 @@ def search_chebi(query: str, size: Optional[int] = None) -> List[v2.SmallMolecul
         atp_results = search_chebi('ATP', 5)
     """
     client = ChEBIClient()
-    chebi_entries = client.search_entries(query, size)
+    chebi_sources = client.search_entries(query, size)
 
-    return [process_chebi_entry(entry) for entry in chebi_entries]
+    return [process_search_result(source) for source in chebi_sources]
 
 
 def process_id(name: str) -> str:
diff --git a/pyenzyme/fetcher/pubchem.py b/pyenzyme/fetcher/pubchem.py
@@ -98,9 +98,6 @@ def from_cid(cid: int) -> PubChemQuery:
             response = client.get(url)
             response.raise_for_status()
 
-        if response.status_code != 200:
-            raise ValueError(f"Failed to fetch PubChem data for CID {cid}")
-
         return PubChemQuery(**response.json())
 
     @staticmethod
diff --git a/tests/integration/test_fetcher.py b/tests/integration/test_fetcher.py
@@ -1,3 +1,4 @@
+import httpx
 import pytest
 
 from pyenzyme.fetcher.chebi import fetch_chebi
@@ -155,7 +156,7 @@ def test_fetch_pubchem_to_small_molecule_with_prefix(self):
 
     @pytest.mark.remote
     def test_fetch_pubchem_to_small_molecule_invalid_id(self):
-        with pytest.raises(ValueError):
+        with pytest.raises(httpx.HTTPStatusError):
             fetch_pubchem(cid="162176127617627")
 
     @pytest.mark.remote