Skip to content

Commit 8f680c0

Browse files
committed
check organism id accuracy, the AI makes things up
1 parent 777c1fb commit 8f680c0

3 files changed

Lines changed: 168 additions & 47 deletions

File tree

vcell-core/src/main/java/org/vcell/pathway/PathwayXMLHelper.java

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@
1212

1313
import java.util.Hashtable;
1414

15+
import org.apache.logging.log4j.LogManager;
16+
import org.apache.logging.log4j.Logger;
1517
import org.jdom2.Attribute;
1618
import org.jdom2.Document;
1719
import org.jdom2.Element;
1820
import org.jdom2.Parent;
1921

2022
public class PathwayXMLHelper {
2123

24+
private static final Logger lg = LogManager.getLogger(PathwayXMLHelper.class);
25+
2226
// public static final Namespace vcns = Namespace.getNamespace("vcns", "vcns-something");
2327
public static final String schemaString = new String("http://www.w3.org/2001/XMLSchema#string");
2428
public static final String schemaInt = new String("http://www.w3.org/2001/XMLSchema#int");
@@ -85,50 +89,50 @@ private static String getElementPathString(Element childElement) {
8589
public static void showUnexpected(Attribute attribute, BioPaxObject bpObject) {
8690
String message = "Unexpected attribute " + getElementPathString(attribute.getParent()) + " << " + attribute.getQualifiedName();
8791
bpObject.addParserWarning(message);
88-
//System.out.println(message);
92+
lg.debug(message);
8993
}
9094
public static void showUnexpected(Object object, BioPaxObject bpObject) {
9195
String message = "Unexpected object " + object.toString();
9296
bpObject.addParserWarning(message);
93-
//System.out.println(message);
97+
lg.debug(message);
9498
}
9599
public static void showIgnored(Attribute attribute, BioPaxObject bpObject) {
96100
String message = "Ignored attribute " + getElementPathString(attribute.getParent()) + " << " + attribute.getQualifiedName();
97101
bpObject.addParserWarning(message);
98-
//System.out.println(message);
102+
lg.debug(message);
99103
}
100104
public static void showUnexpected(Element childElement, BioPaxObject bpObject) {
101105
String message = "Unexpected element " + getElementPathString(childElement);
102106
bpObject.addParserWarning(message);
103-
//System.out.println(message);
107+
lg.debug(message);
104108
}
105109
public static void showIgnored(Element childElement, String reason, BioPaxObject bpObject) {
106110
// if (!reason.contains("?")){
107111
// return;
108112
// }
109113
String message = "Ignoring element " + getElementPathString(childElement) + " " + reason;
110114
bpObject.addParserWarning(message);
111-
//System.out.println(message);
115+
lg.debug(message);
112116
}
113117
public static void showUnexpected(Attribute attribute) {
114118
String message = "Unexpected attribute " + getElementPathString(attribute.getParent()) + " << " + attribute.getQualifiedName();
115-
//System.out.println(message);
119+
lg.debug(message);
116120
}
117121
public static void showUnexpected(Object object) {
118122
String message = "Unexpected object " + object.toString();
119-
//System.out.println(message);
123+
lg.debug(message);
120124
}
121125
public static void showIgnored(Attribute attribute) {
122126
String message = "Ignored attribute " + getElementPathString(attribute.getParent()) + " << " + attribute.getQualifiedName();
123-
//System.out.println(message);
127+
lg.debug(message);
124128
}
125129
public static void showUnexpected(Element childElement) {
126130
String message = "Unexpected element " + getElementPathString(childElement);
127-
System.err.println(message);
131+
lg.debug(message);
128132
}
129133
public static void showIgnored(Element childElement, String reason) {
130134
String message = "Ignoring element " + getElementPathString(childElement) + " " + reason;
131-
//System.out.println(message);
135+
lg.debug(message);
132136
}
133137

134138
}
Lines changed: 112 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,25 @@
11
package org.vcell.util.bioregistry.ncbitaxon;
22

3+
import java.io.IOException;
4+
import java.time.Duration;
35
import java.util.Map;
6+
import java.net.URI;
7+
import java.net.http.HttpClient;
8+
import java.net.http.HttpRequest;
9+
import java.net.http.HttpResponse;
10+
import java.util.Random;
11+
import java.util.regex.Matcher;
12+
import java.util.regex.Pattern;
13+
import com.fasterxml.jackson.databind.JsonNode;
14+
import com.fasterxml.jackson.databind.ObjectMapper;
15+
import org.apache.logging.log4j.LogManager;
16+
import org.apache.logging.log4j.Logger;
17+
418

519
public class OrganismLookup {
620

21+
private static final Logger lg = LogManager.getLogger(OrganismLookup.class);
22+
723
public enum NameType {
824
COMMON,
925
SCIENTIFIC
@@ -12,26 +28,25 @@ public enum NameType {
1228
private static final String PREFIX = "http://bioregistry.io/ncbitaxon:";
1329

1430
private static final Map<String, String> COMMON_NAMES = Map.ofEntries(
15-
Map.entry("9606", "Human"),
16-
Map.entry("10090", "Mouse"),
17-
Map.entry("10116", "Rat"),
18-
Map.entry("7955", "Zebrafish"),
19-
Map.entry("7227", "Fruit fly"),
31+
Map.entry("9606", "human"),
32+
Map.entry("10090", "house mouse"),
33+
Map.entry("10116", "Norway rat"),
34+
Map.entry("7955", "zebrafish"),
35+
Map.entry("7227", "fruit fly"),
2036
Map.entry("6239", "Nematode worm"),
21-
Map.entry("10141", "Guinea pig"),
22-
Map.entry("9986", "Rabbit"),
23-
Map.entry("9615", "Dog"),
24-
Map.entry("9823", "Pig"),
37+
Map.entry("10141", "domestic guinea pig"),
38+
Map.entry("9986", "rabbit"),
39+
Map.entry("9615", "dog"),
2540
Map.entry("9544", "Rhesus monkey"),
26-
Map.entry("9685", "Cat"),
27-
Map.entry("9913", "Cow"),
28-
Map.entry("9031", "Chicken"),
29-
Map.entry("8364", "Xenopus frog"),
30-
Map.entry("28377", "Axolotl"),
31-
Map.entry("9825", "Mini pig"),
32-
Map.entry("9796", "Horse"),
33-
Map.entry("9915", "Sheep"),
34-
Map.entry("9940", "Goat")
41+
Map.entry("9685", "domestic cat"),
42+
Map.entry("9913", "domestic cattle"),
43+
Map.entry("9031", "chicken"),
44+
Map.entry("8364", "tropical clawed frog"),
45+
Map.entry("8296", "axolotl"),
46+
Map.entry("9825", "domestic pig"),
47+
Map.entry("9796", "horse"),
48+
Map.entry("9940", "sheep"),
49+
Map.entry("9925", "Goat")
3550
);
3651

3752
private static final Map<String, String> SCIENTIFIC_NAMES = Map.ofEntries(
@@ -44,20 +59,19 @@ public enum NameType {
4459
Map.entry("10141", "Cavia porcellus"),
4560
Map.entry("9986", "Oryctolagus cuniculus"),
4661
Map.entry("9615", "Canis lupus familiaris"),
47-
Map.entry("9823", "Sus scrofa"),
4862
Map.entry("9544", "Macaca mulatta"),
4963
Map.entry("9685", "Felis catus"),
5064
Map.entry("9913", "Bos taurus"),
5165
Map.entry("9031", "Gallus gallus"),
52-
Map.entry("8364", "Xenopus laevis"),
53-
Map.entry("28377", "Ambystoma mexicanum"),
66+
Map.entry("8364", "Xenopus tropicalis"),
67+
Map.entry("8296", "Ambystoma mexicanum"),
5468
Map.entry("9825", "Sus scrofa domesticus"),
5569
Map.entry("9796", "Equus caballus"),
56-
Map.entry("9915", "Ovis aries"),
57-
Map.entry("9940", "Capra hircus")
70+
Map.entry("9940", "Ovis aries"),
71+
Map.entry("9925", "Capra hircus")
5872
);
5973

60-
// 🧪 Get name from full URI
74+
// get name from full URI
6175
public static String getName(String fullUri, NameType type) {
6276
if (!fullUri.startsWith(PREFIX)) return "Unknown";
6377
String taxonId = fullUri.substring(PREFIX.length());
@@ -68,7 +82,7 @@ public static String getName(String fullUri, NameType type) {
6882
};
6983
}
7084

71-
// 🔁 Reverse lookup: get full URI from name
85+
// reverse lookup: get full URI from name
7286
public static String getUriFromName(String name, NameType type) {
7387
Map<String, String> sourceMap = switch (type) {
7488
case COMMON -> COMMON_NAMES;
@@ -81,4 +95,77 @@ public static String getUriFromName(String name, NameType type) {
8195
.findFirst()
8296
.orElse("Unknown");
8397
}
98+
99+
public static HttpResponse<String> fetchTaxonomyResponse(String taxonId) throws IOException, InterruptedException {
100+
String url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
101+
+ "?db=taxonomy&id=" + taxonId + "&retmode=json";
102+
103+
HttpClient client = HttpClient.newBuilder()
104+
.connectTimeout(Duration.ofSeconds(5))
105+
.build();
106+
107+
HttpRequest request = HttpRequest.newBuilder()
108+
.uri(URI.create(url))
109+
.timeout(Duration.ofSeconds(10))
110+
.header("Accept", "application/json")
111+
.build();
112+
113+
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
114+
return response;
115+
}
116+
117+
public static String parseTaxonomyName(String taxonId, String jsonBody) throws IOException {
118+
ObjectMapper mapper = new ObjectMapper();
119+
JsonNode root = mapper.readTree(jsonBody);
120+
JsonNode resultNode = root.path("result").path(taxonId);
121+
122+
if (resultNode.isMissingNode()) {
123+
return "Error: Taxon ID not found";
124+
}
125+
126+
String scientificName = resultNode.path("scientificname").asText("Unknown");
127+
String commonName = resultNode.path("commonname").asText(null);
128+
129+
String name = (commonName != null && !commonName.isEmpty())
130+
? scientificName + " (" + commonName + ")"
131+
: scientificName;
132+
return name;
133+
}
134+
135+
public static void verifyAllTaxonMappings() {
136+
Random random = new Random();
137+
138+
for (Map.Entry<String, String> entry : SCIENTIFIC_NAMES.entrySet()) {
139+
String taxonId = entry.getKey();
140+
String expectedScientific = SCIENTIFIC_NAMES.getOrDefault(taxonId, "(none)");
141+
String expectedCommon = COMMON_NAMES.getOrDefault(taxonId, "(none)");
142+
143+
try {
144+
HttpResponse<String> response = fetchTaxonomyResponse(taxonId);
145+
String result = parseTaxonomyName(taxonId, response.body());
146+
147+
lg.debug("Taxon ID: {}", taxonId);
148+
lg.debug(" Returned: {}", result);
149+
lg.debug(" Expected: {} ({})", expectedScientific, expectedCommon);
150+
lg.debug(""); // optional, consider removing if it adds no value
151+
152+
} catch (java.net.http.HttpTimeoutException e) {
153+
lg.warn("Timeout for Taxon ID: {}", taxonId, e);
154+
} catch (java.net.UnknownHostException e) {
155+
lg.warn("Host unreachable for Taxon ID: {}", taxonId, e);
156+
} catch (Exception e) {
157+
lg.warn("Error for Taxon ID: {}", taxonId, e);
158+
}
159+
160+
// wait 5 to 10 random seconds before calls so that we won't look like a denial of service attack
161+
int delaySeconds = 5 + random.nextInt(6); // 5–10 seconds
162+
try {
163+
Thread.sleep(delaySeconds * 1000L);
164+
} catch (InterruptedException e) {
165+
Thread.currentThread().interrupt();
166+
lg.warn("Interrupted during sleep");
167+
}
168+
}
169+
}
170+
84171
}

vcell-core/src/test/java/cbit/vcell/pathway/PathwaySearchTest.java

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import java.io.*;
2121
import java.net.*;
2222
import java.net.http.HttpClient;
23+
import java.net.http.HttpResponse;
24+
import java.net.http.HttpTimeoutException;
2325
import java.nio.charset.StandardCharsets;
2426
import java.nio.file.Files;
2527
import java.nio.file.Path;
@@ -30,7 +32,10 @@
3032
import java.time.Duration;
3133
import java.util.*;
3234
import cbit.util.xml.XmlUtil;
35+
import org.vcell.util.bioregistry.ncbitaxon.OrganismLookup;
3336
import java.util.stream.Collectors;
37+
import org.apache.logging.log4j.LogManager;
38+
import org.apache.logging.log4j.Logger;
3439

3540
import static org.junit.jupiter.api.Assertions.*;
3641
import static org.vcell.util.network.ClientDownloader.downloadBytes;
@@ -39,6 +44,8 @@
3944
@Tag("Fast")
4045
public class PathwaySearchTest {
4146

47+
private static final Logger lg = LogManager.getLogger(PathwaySearchTest.class);
48+
4249
private static final Namespace RDF_NS = Namespace.getNamespace("rdf",
4350
"http://www.w3.org/1999/02/22-rdf-syntax-ns#");
4451
private static final Namespace BP_NS = Namespace.getNamespace("bp",
@@ -69,7 +76,7 @@ public static void tearDown() {
6976
public void pathwayDownloadTest() throws MalformedURLException {
7077
String pathwayId = "5683177"; // Reactome pathway ID
7178
pathwayDownload(pathwayId);
72-
System.out.println("pathwayDownloadTest - done");
79+
lg.debug("pathwayDownloadTest - done");
7380
}
7481

7582

@@ -83,7 +90,7 @@ public void searchTest() throws IOException {
8390
String uri = "https://www.pathwaycommons.org/pc2/search?"
8491
+ "q=" + encodedQ
8592
+ "&type=pathway";
86-
System.out.println("Query URI: " + uri);
93+
lg.debug("Query URI: " + uri);
8794

8895
HttpURLConnection conn = (HttpURLConnection) new URL(uri).openConnection();
8996
conn.setRequestProperty("Accept", "application/xml");
@@ -105,9 +112,33 @@ public void searchTest() throws IOException {
105112
List<org.jdom2.Element> hits = root.getChildren("searchHit");
106113
assertFalse(hits.isEmpty(), "No <searchHit> elements found");
107114
assertEquals(100, hits.size(), "Expected 100 <searchHit> elements, but found " + hits.size());
108-
System.out.println("searchTest - done");
115+
lg.debug("searchTest - done");
116+
}
117+
118+
@Test
119+
public void fetchTaxonomyNameFromIdTest() {
120+
String taxonId = "9940"; // Ovis aries (sheep)
121+
122+
try {
123+
HttpResponse<String> response = OrganismLookup.fetchTaxonomyResponse(taxonId);
124+
assertEquals(200, response.statusCode(), "Unexpected HTTP status");
125+
126+
String result = OrganismLookup.parseTaxonomyName(taxonId, response.body());
127+
assertEquals("Ovis aries (sheep)", result, "Incorrect taxonomy name");
128+
129+
} catch (HttpTimeoutException e) {
130+
fail("Timeout occurred while fetching taxonomy data");
131+
} catch (UnknownHostException e) {
132+
fail("Host unreachable: check network or endpoint");
133+
} catch (IOException e) {
134+
fail("I/O error during taxonomy fetch: " + e.getMessage());
135+
} catch (InterruptedException e) {
136+
Thread.currentThread().interrupt();
137+
fail("Request interrupted");
138+
}
109139
}
110140

141+
111142
// --- Utilities ---------------------------------------------------------------------------------------------------
112143

113144
// wrapper to get a jdom2 Document
@@ -287,10 +318,10 @@ private static void pathwayDownloadToFile(String pathwayId, String destinationDi
287318
Path outputFile = dirPath.resolve(pathwayId + ".xml");
288319
java.nio.file.Files.write(outputFile, contentString.getBytes(StandardCharsets.UTF_8));
289320

290-
System.out.println("Saved BioPAX to: " + outputFile.toAbsolutePath());
321+
lg.debug("Saved BioPAX to: " + outputFile.toAbsolutePath());
291322
}
292323
catch (Exception e) {
293-
System.err.println("Failed to download or save pathway " + pathwayId);
324+
lg.warn("Failed to download or save pathway " + pathwayId);
294325
e.printStackTrace();
295326
}
296327
}
@@ -323,7 +354,7 @@ private static Document filterPathwayByTargetId(String pathwayId, String targetI
323354

324355
} catch (IOException e) {
325356
// fallback: download from Pathway Commons if local file is missing or unreadable
326-
System.err.println("Failed to read local file — falling back to download");
357+
lg.warn("Failed to read local file — falling back to download");
327358
readSuccess = false;
328359
}
329360
try {
@@ -362,10 +393,9 @@ private static void writeFilteredPathway(Document doc, String pathwayId, String
362393
try (FileWriter writer = new FileWriter(outputPath.toFile())) {
363394
outputter.output(doc, writer);
364395
}
365-
System.out.println("Filtered pathway written to: " + outputPath.toAbsolutePath());
396+
lg.debug("Filtered pathway written to: " + outputPath.toAbsolutePath());
366397
} catch (IOException e) {
367-
System.err.println("Failed to write filtered pathway to file");
368-
e.printStackTrace();
398+
lg.warn("Failed to write filtered pathway to file", e);
369399
}
370400
}
371401

@@ -379,17 +409,17 @@ public static void main(String[] args) {
379409
// pathwayDownloadToFile(pathwayId, null);
380410
Document filteredDoc = filterPathwayByTargetId(pathwayId, targetId, null);
381411
if(filteredDoc == null) {
382-
System.out.println("Filtered document is null");
412+
lg.debug("Filtered document is null");
383413
System.exit(1);
384414
}
385415
// output the result as a pretty‐printed XML String
386416
XMLOutputter out = new XMLOutputter(Format.getPrettyFormat());
387417
String filteredString = out.outputString(filteredDoc);
388-
System.out.println(filteredString);
418+
lg.debug(filteredString);
389419

390420
// optional, save it
391421
writeFilteredPathway( filteredDoc, pathwayId, targetId, null);
392-
System.out.println("Done work on pathway id " + pathwayId);
422+
lg.debug("Done work on pathway id " + pathwayId);
393423
}
394424

395425

0 commit comments

Comments
 (0)