From 22bd1f00426ba50bd3b269c8add44fe7c17af5f8 Mon Sep 17 00:00:00 2001 From: Jim Schaff Date: Mon, 4 May 2026 23:52:13 -0400 Subject: [PATCH 1/3] Use UTF-8 explicitly when reading SBML/SED-ML XML Six call sites in the SBML and SED-ML import paths were reading XML input via Charset.defaultCharset() (or naked .getBytes() / no-charset InputStreamReader), which uses the JVM platform default encoding. On a non-UTF-8 default JVM (e.g. Cp1252 on legacy Windows), a UTF-8 SBML file with non-ASCII chars (Greek letters, en-dashes, etc.) gets mojibake'd before the XML parser sees it. XML 1.0 says: in absence of an declaration the parser MUST treat input as UTF-8 (or detect via BOM). We were using the platform's preferred charset instead. This is a likely contributor to the corrupted reaction-name characters observed in BioModels 311226221 and 311875206 (where '14-3-3' appears as '1433' or '1433' in the cached VCML). Fixed sites: - SBMLImporter.readSbmlDocument(File): line-based read of SBML file - SedMLImporter: in-memory SBML String -> InputStream for re-import - jlibsedml.Libsedml.readDocument(InputStream, encoding): null-encoding fallback was platform default - jlibsedml.XpathGeneratorHelper: model String -> bytes -> XML parse - vcell.sybil.util.xml.DOMUtil.parse(String): String -> bytes -> XML parse - jlibsedml.modelsupport.KisaoTermParser: bundled .obo resource read The render-namespace string-munge workaround in SBMLImporter (lines 1936-1948) is unchanged in this commit. Whether it is still required against current jsbml is a separate question, gated on a corpus-driven test using the sys-bio/temp-biomodels SBML/OMEX archive. Co-Authored-By: Claude Opus 4.7 (1M context) --- vcell-core/src/main/java/org/jlibsedml/Libsedml.java | 6 +++--- .../src/main/java/org/jlibsedml/XpathGeneratorHelper.java | 3 ++- .../java/org/jlibsedml/modelsupport/KisaoTermParser.java | 3 ++- .../src/main/java/org/vcell/sbml/vcell/SBMLImporter.java | 3 +-- vcell-core/src/main/java/org/vcell/sedml/SedMLImporter.java | 4 ++-- .../src/main/java/org/vcell/sybil/util/xml/DOMUtil.java | 3 ++- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/vcell-core/src/main/java/org/jlibsedml/Libsedml.java b/vcell-core/src/main/java/org/jlibsedml/Libsedml.java index 3bf97cea85..bf00e30755 100644 --- a/vcell-core/src/main/java/org/jlibsedml/Libsedml.java +++ b/vcell-core/src/main/java/org/jlibsedml/Libsedml.java @@ -11,7 +11,7 @@ import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.StringReader; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.zip.ZipEntry; @@ -100,8 +100,8 @@ public static SedMLDocument readDocument(File file) throws XMLException { */ public static SedMLDocument readDocument(InputStream is, String encoding) throws XMLException, IOException { if(encoding == null) { - encoding = Charset.defaultCharset().name(); - } + encoding = StandardCharsets.UTF_8.name(); + } List lines = IOUtils.readLines(is, encoding); String content = StringUtils.join(lines, "\n"); return readDocumentFromString(content); diff --git a/vcell-core/src/main/java/org/jlibsedml/XpathGeneratorHelper.java b/vcell-core/src/main/java/org/jlibsedml/XpathGeneratorHelper.java index d61cad14c6..5ca73c1561 100644 --- a/vcell-core/src/main/java/org/jlibsedml/XpathGeneratorHelper.java +++ b/vcell-core/src/main/java/org/jlibsedml/XpathGeneratorHelper.java @@ -3,6 +3,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -51,7 +52,7 @@ public boolean addIdentifiersAsDataGenerators(final AbstractTask task, final Str String modelStrRep = modelResolver.getModelXMLFor(modelFound.getSourceURI()); if (modelStrRep == null) return false; - Document doc = utils.readDoc(new ByteArrayInputStream(modelStrRep.getBytes())); + Document doc = utils.readDoc(new ByteArrayInputStream(modelStrRep.getBytes(StandardCharsets.UTF_8))); List configs = new ArrayList<>(); for (IdName idn : idNameList) { diff --git a/vcell-core/src/main/java/org/jlibsedml/modelsupport/KisaoTermParser.java b/vcell-core/src/main/java/org/jlibsedml/modelsupport/KisaoTermParser.java index 2020ed8d33..9a9f8b52b5 100644 --- a/vcell-core/src/main/java/org/jlibsedml/modelsupport/KisaoTermParser.java +++ b/vcell-core/src/main/java/org/jlibsedml/modelsupport/KisaoTermParser.java @@ -7,6 +7,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -34,7 +35,7 @@ class KisaoTermParser { KisaoOntology parse() { InputStream is2 = KisaoTermParser.class.getClassLoader() .getResourceAsStream(Kisao_OBO); - BufferedReader isr = new BufferedReader(new InputStreamReader(is2)); + BufferedReader isr = new BufferedReader(new InputStreamReader(is2, StandardCharsets.UTF_8)); String line = null; boolean inPreamble = true; boolean inState = false; diff --git a/vcell-core/src/main/java/org/vcell/sbml/vcell/SBMLImporter.java b/vcell-core/src/main/java/org/vcell/sbml/vcell/SBMLImporter.java index 4dd853f051..cbe15b8506 100644 --- a/vcell-core/src/main/java/org/vcell/sbml/vcell/SBMLImporter.java +++ b/vcell-core/src/main/java/org/vcell/sbml/vcell/SBMLImporter.java @@ -83,7 +83,6 @@ import javax.xml.stream.XMLStreamException; import java.beans.PropertyVetoException; import java.io.*; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.Map.Entry; @@ -1931,7 +1930,7 @@ private SBMLDocument readSbmlDocument(File sbmlFile){ final String defaultErrorPrefix = "Unable to read SBML file"; try { // Read SBML model into libSBML SBMLDocument and create an SBML model - List readLines = FileUtils.readLines(sbmlFile, Charset.defaultCharset()); + List readLines = FileUtils.readLines(sbmlFile, StandardCharsets.UTF_8); StringBuilder sb = new StringBuilder(); //Temporary fix for org.sbml.jsbml.xml.parsers.RenderParser.processEndDocument(SBMLDocument sbmlDocument) //throws NPE when " Date: Mon, 4 May 2026 23:55:50 -0400 Subject: [PATCH 2/3] Test SBML import preserves UTF-8 non-ASCII chars in names Imports a minimal SBML L2V4 document containing U+2013 EN DASH and U+03BC GREEK SMALL LETTER MU in reaction and species name attributes, and asserts the resulting BioModel preserves the chars byte-for-byte in getSbmlName(). Tagged Fast. Exercises both the File path (readSbmlDocument(File), which used to read with Charset.defaultCharset()) and the InputStream path. On a UTF-8-default JVM both paths look equivalent, but the test documents expected behavior and catches regressions if either path is changed back to platform-default decoding. A complementary CI job that forks a Cp1252-default JVM is the follow-up that demonstrates the fix matters cross-platform. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../org/vcell/sbml/SBMLImportCharsetTest.java | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 vcell-core/src/test/java/org/vcell/sbml/SBMLImportCharsetTest.java diff --git a/vcell-core/src/test/java/org/vcell/sbml/SBMLImportCharsetTest.java b/vcell-core/src/test/java/org/vcell/sbml/SBMLImportCharsetTest.java new file mode 100644 index 0000000000..555b26d33b --- /dev/null +++ b/vcell-core/src/test/java/org/vcell/sbml/SBMLImportCharsetTest.java @@ -0,0 +1,110 @@ +package org.vcell.sbml; + +import cbit.util.xml.VCLogger; +import cbit.util.xml.VCLoggerException; +import cbit.vcell.biomodel.BioModel; +import cbit.vcell.model.ReactionStep; +import cbit.vcell.model.SpeciesContext; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.vcell.sbml.vcell.SBMLImporter; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +/** + * Verifies SBML import reads non-ASCII attribute values byte-for-byte from a + * UTF-8 source. The two reaction-name patterns chosen here (en-dash U+2013 and + * Greek mu U+03BC) are common in scientific notation and would mojibake under + * the previous {@code Charset.defaultCharset()} read on a non-UTF-8 JVM. + */ +@Tag("Fast") +public class SBMLImportCharsetTest { + + private static class CapturingVCLogger extends VCLogger { + @Override public boolean hasMessages() { return false; } + @Override public void sendAllMessages() { } + @Override public void sendMessage(Priority p, ErrorType et, String message) throws VCLoggerException { + if (p == Priority.HighPriority) { + throw new VCLoggerException(p + " " + et + ": " + message); + } + } + } + + private static final String SBML_WITH_NON_ASCII = + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " 1.0\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n"; + + @Test + public void importsUtf8ReactionAndSpeciesNames() throws Exception { + Path tmp = Files.createTempFile("vcell-charset-test-", ".xml"); + try { + Files.write(tmp, SBML_WITH_NON_ASCII.getBytes(StandardCharsets.UTF_8)); + + SBMLImporter importer = new SBMLImporter(tmp.toAbsolutePath().toString(), new CapturingVCLogger(), false); + BioModel bioModel = importer.getBioModel(); + assertNotNull(bioModel); + + ReactionStep r1 = null; + for (ReactionStep rs : bioModel.getModel().getReactionSteps()) { + if ("r1".equals(rs.getName())) { r1 = rs; break; } + } + assertNotNull(r1, "expected reaction with id 'r1' in imported model"); + assertEquals("k_14–3–3", r1.getSbmlName(), + "reaction sbmlName must preserve U+2013 EN DASH characters"); + + SpeciesContext s1 = null; + for (SpeciesContext sc : bioModel.getModel().getSpeciesContexts()) { + if ("s1".equals(sc.getName())) { s1 = sc; break; } + } + assertNotNull(s1, "expected species with id 's1' in imported model"); + assertEquals("μ-prot", s1.getSbmlName(), + "species sbmlName must preserve U+03BC GREEK SMALL LETTER MU"); + } finally { + Files.deleteIfExists(tmp); + } + } + + @Test + public void inputStreamPathPreservesUtf8() throws Exception { + try (java.io.ByteArrayInputStream in = + new java.io.ByteArrayInputStream(SBML_WITH_NON_ASCII.getBytes(StandardCharsets.UTF_8))) { + SBMLImporter importer = new SBMLImporter(in, new CapturingVCLogger(), false); + BioModel bioModel = importer.getBioModel(); + assertNotNull(bioModel); + + ReactionStep r1 = null; + for (ReactionStep rs : bioModel.getModel().getReactionSteps()) { + if ("r1".equals(rs.getName())) { r1 = rs; break; } + } + assertNotNull(r1); + assertEquals("k_14–3–3", r1.getSbmlName()); + } + } +} From d91068561a20759b64d3ae782d1e9b23d7624edb Mon Sep 17 00:00:00 2001 From: Jim Schaff Date: Tue, 5 May 2026 09:04:28 -0400 Subject: [PATCH 3/3] Disable DTD and external entity processing in DOMUtil parser CodeQL flagged the existing DOMUtil.parse(String) call site for CWE-611 (XXE) when the prior commit changed the line. Caller is the Pathway Commons HTTP error response parser, which has no legitimate need for DTDs or external entity expansion. Apply the OWASP-recommended JAXP hardening features at builder init time. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/main/java/org/vcell/sybil/util/xml/DOMUtil.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vcell-core/src/main/java/org/vcell/sybil/util/xml/DOMUtil.java b/vcell-core/src/main/java/org/vcell/sybil/util/xml/DOMUtil.java index e5e6c85655..9ab0f591c2 100644 --- a/vcell-core/src/main/java/org/vcell/sybil/util/xml/DOMUtil.java +++ b/vcell-core/src/main/java/org/vcell/sybil/util/xml/DOMUtil.java @@ -43,6 +43,13 @@ public class DOMUtil { protected static void initBuilder() throws ParserConfigurationException { if(builder == null) { DocumentBuilderFactory factory = new DocumentBuilderFactoryImpl(); + // CWE-611: disable DTD and external entity processing + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setXIncludeAware(false); + factory.setExpandEntityReferences(false); builder = factory.newDocumentBuilder(); } }