Skip to content

Commit 72b4f0c

Browse files
xml2json: Fix encoding, enhance XML-to-JSON transformation (#2632)
* **refactor(xml-json): enhance XML-to-JSON transformation & improve encoding handling** - Applied secure XML processing settings with logging for unsupported features. - Improved XML-to-JSON transformation flow in `Xml2JsonInterceptor`, ensuring content type alignment and better exception handling. - Added comprehensive charset handling, including support for `ISO-8859-1` and UTF-8 content types. - Expanded test coverage for encoding scenarios, ensuring transformation accuracy across charset variations. - Updated documentation and comments for clarity and consistency in related classes (`Header`, `MimeType`, etc.). * **refactor(xml-json): improve encoding handling, XML-to-JSON transformation, and tests** - Replaced `StandardCharsets.ISO_8859_1` with static import for consistency. - Adjusted buffer size in `XMLEncodingUtil` for better performance in encoding extraction. - Enhanced exception handling in `Xml2JsonInterceptor` with improved logging for unsupported encoding errors. - Renamed constants and methods in tests (`ÜÖÜÖÜÖ` to `UMLAUTS`) for better clarity. - Updated and aligned Javadoc comments. - Simplified test method names and expanded coverage for encoding scenarios. * docs(header): update Javadoc for clarity, removing incorrect thread-safety claim * refactor(header): specify locale for charset normalization in `Content-Type` parsing * fix --------- Co-authored-by: Christian Gördes <christian.goerdes@outlook.de>
1 parent b48d3ba commit 72b4f0c

10 files changed

Lines changed: 803 additions & 643 deletions

File tree

core/src/main/java/com/predic8/membrane/core/http/Header.java

Lines changed: 559 additions & 540 deletions
Large diffs are not rendered by default.

core/src/main/java/com/predic8/membrane/core/http/MimeType.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ public class MimeType {
3939
public static final String TEXT_XML = "text/xml";
4040
public static final String TEXT_HTML = "text/html";
4141
public static final String TEXT_XML_UTF8 = TEXT_XML + ";charset=UTF-8";
42+
public static final String TEXT_XML_ISO_8859_1 = TEXT_XML + ";charset=ISO-8859-1";
4243

4344
public static final String TEXT_HTML_UTF8 = "text/html;charset=UTF-8";
4445
public static final String TEXT_PLAIN = "text/plain";

core/src/main/java/com/predic8/membrane/core/interceptor/xml/Xml2JsonInterceptor.java

Lines changed: 73 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -18,34 +18,57 @@
1818
import com.predic8.membrane.core.exchange.*;
1919
import com.predic8.membrane.core.http.*;
2020
import com.predic8.membrane.core.interceptor.*;
21+
import com.predic8.membrane.core.util.xml.*;
2122
import org.json.*;
2223
import org.slf4j.*;
23-
import org.w3c.dom.*;
2424

25-
import javax.xml.*;
26-
import javax.xml.parsers.*;
27-
import javax.xml.transform.*;
28-
import javax.xml.transform.dom.*;
29-
import javax.xml.transform.stream.*;
3025
import java.io.*;
3126

3227
import static com.predic8.membrane.core.exceptions.ProblemDetails.*;
33-
import static com.predic8.membrane.core.interceptor.Interceptor.Flow.REQUEST;
28+
import static com.predic8.membrane.core.http.MimeType.*;
29+
import static com.predic8.membrane.core.interceptor.Interceptor.Flow.*;
3430
import static com.predic8.membrane.core.interceptor.Outcome.*;
31+
import static com.predic8.membrane.core.interceptor.Outcome.ABORT;
3532
import static java.nio.charset.StandardCharsets.*;
36-
import static javax.xml.transform.OutputKeys.*;
37-
3833

3934
/**
40-
* @description If enabled converts body content from xml to json.
41-
* @explanation Can be used for both request and response. Xml file assumed to be in UTF-8. If input is invalid it returns
42-
* empty json object.
35+
* @description Converts an XML message body to JSON.
36+
* <p>
37+
* The interceptor performs a generic XML-to-JSON transformation using a
38+
* structural mapping of XML elements and attributes to JSON objects.
39+
* While this works well for simple and data-oriented XML, it has inherent
40+
* limitations and challenges.
41+
* </p>
42+
*
43+
* <p>
44+
* In particular:
45+
* <ul>
46+
* <li>XML attributes and elements are both mapped to JSON properties, which
47+
* can lead to ambiguities.</li>
48+
* <li>Element order, mixed content, and namespaces may not be preserved
49+
* in a meaningful way.</li>
50+
* <li>Repeated elements are heuristically converted into JSON arrays,
51+
* which may not match the intended domain model.</li>
52+
* </ul>
53+
* </p>
54+
*
55+
* <p>
56+
* This interceptor is intended for integration scenarios where XML is used
57+
* as a transport format and the JSON representation is primarily consumed
58+
* by applications that do not require full fidelity of the original XML
59+
* structure.
60+
* </p>
61+
*
62+
* <p>
63+
* For complex XML schemas or contract-driven integrations, a dedicated
64+
* transformation using a template, XSLT or a schema-aware mapping is recommended.
65+
* </p>
4366
* @topic 2. Enterprise Integration Patterns
4467
*/
45-
@MCElement(name="xml2Json")
68+
@MCElement(name = "xml2Json")
4669
public class Xml2JsonInterceptor extends AbstractInterceptor {
4770

48-
private static final Logger log = LoggerFactory.getLogger(Xml2JsonInterceptor.class.getName());
71+
private static final Logger log = LoggerFactory.getLogger(Xml2JsonInterceptor.class);
4972

5073
@Override
5174
public String getShortDescription() {
@@ -54,84 +77,63 @@ public String getShortDescription() {
5477

5578
@Override
5679
public Outcome handleRequest(Exchange exc) {
57-
try {
58-
return handleInternal(exc.getRequest());
59-
} catch (Exception e) {
60-
log.error("", e);
61-
internal(router.getConfiguration().isProduction(),getDisplayName())
62-
.flow(REQUEST)
63-
.detail("Could not transform XML to JSON!")
64-
.exception(e)
65-
.buildAndSetResponse(exc);
66-
return ABORT;
67-
}
80+
return handleInternal(exc, REQUEST);
6881
}
6982

7083
@Override
7184
public Outcome handleResponse(Exchange exc) {
72-
try {
73-
return handleInternal(exc.getResponse());
74-
} catch (Exception e) {
75-
internal(router.getConfiguration().isProduction(),getDisplayName())
76-
.flow(Flow.RESPONSE)
77-
.detail("Could not return WSDL document!")
78-
.exception(e)
79-
.buildAndSetResponse(exc);
80-
return ABORT;
81-
}
85+
return handleInternal(exc, RESPONSE);
8286
}
8387

84-
private Outcome handleInternal(Message msg) throws Exception {
85-
if(!msg.isXML()){
88+
private Outcome handleInternal(Exchange exc, Flow flow) {
89+
Message msg = exc.getMessage(flow);
90+
if (!msg.isXML()) {
8691
return CONTINUE;
8792
}
88-
89-
if(msg.getHeader().getContentEncoding() != null){
90-
msg.setBodyContent(xml2json(msg.getBodyAsStreamDecoded(), msg.getHeader().getContentEncoding()));
91-
}
92-
else{
93-
msg.setBodyContent(xml2json(loadXMLFromStream(msg.getBodyAsStreamDecoded())));
93+
try {
94+
msg.setBodyContent(xml2json(getBodyAsString(msg)));
95+
msg.getHeader().setContentType(APPLICATION_JSON_UTF8);
96+
return CONTINUE;
97+
} catch (UnsupportedEncodingException e) {
98+
handleException(exc, flow, e, "Unsupported encoding: " + e.getMessage());
99+
} catch (Exception e) {
100+
handleException(exc, flow, e, null);
94101
}
95-
msg.getHeader().setContentType(MimeType.APPLICATION_JSON_UTF8);
96-
97-
return CONTINUE;
102+
return ABORT;
98103
}
99104

100-
private byte[] xml2json(InputStream body, String encoding) throws UnsupportedEncodingException {
101-
return XML.toJSONObject(new InputStreamReader(body, encoding)).toString().getBytes(UTF_8);
105+
private static String getBodyAsString(Message msg) throws IOException {
106+
if (msg.getHeader().getCharset() != null) return msg.getBodyAsStringDecoded();
107+
108+
// Conversion is expensive but needed to get encoding from XML
109+
// because org.json.XML ignores the encoding specified in the XML prolog
110+
byte[] body = msg.getBody().getContent();
111+
var fromProlog = XMLEncodingUtil.getEncodingFromXMLProlog(body);
112+
return new String(body, fromProlog != null ? fromProlog : UTF_8.name());
102113
}
103114

115+
104116
private byte[] xml2json(String xml) {
117+
// In org.json.XML the encoding is skipped, so xml encoding is always ignored: x.skipPast("?>");
105118
return XML.toJSONObject(xml).toString().getBytes(UTF_8);
106119
}
107120

108-
public static String loadXMLFromStream(InputStream stream) throws Exception
109-
{
110-
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
111-
factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
112-
return documentToString(factory.newDocumentBuilder().parse(stream));
113-
}
114-
115-
public static String documentToString(Document doc) {
116-
try {
117-
StringWriter sw = new StringWriter();
118-
TransformerFactory tf = TransformerFactory.newInstance();
119-
tf.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, "");
120-
tf.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
121-
Transformer transformer = tf.newTransformer();
122-
transformer.setOutputProperty(OMIT_XML_DECLARATION, "no");
123-
transformer.setOutputProperty(METHOD, "xml");
124-
transformer.setOutputProperty(INDENT, "no");
125-
transformer.setOutputProperty(ENCODING, "UTF-8");
126-
transformer.transform(new DOMSource(doc), new StreamResult(sw));
127-
return sw.toString();
128-
} catch (Exception ex) {
129-
throw new RuntimeException("Error converting to String", ex);
121+
private void handleException(Exchange exc, Flow flow, Exception e, String msg) {
122+
if (msg == null) {
123+
msg = "Could not transform XML to JSON: " + e.getMessage();
124+
log.info(msg, e);
125+
log.debug("", e);
130126
}
127+
internal(router.getConfiguration().isProduction(), getDisplayName()).flow(flow).status(flow == REQUEST ? 400 : 500)
128+
.detail(msg)
129+
.exception(e)
130+
.topLevel("charset-from-header", exc.getMessage(flow).getHeader().getCharset())
131+
.stacktrace(false)
132+
.buildAndSetResponse(exc);
131133
}
132134

133135
@Override
134136
public String getDisplayName() {
135137
return "xml 2 json";
136138
}
137-
}
139+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package com.predic8.membrane.core.util.xml;
2+
3+
import java.nio.charset.*;
4+
import java.util.*;
5+
import java.util.regex.*;
6+
7+
import static java.nio.charset.StandardCharsets.ISO_8859_1;
8+
9+
/**
 * Helper for detecting the character encoding of an XML document from its first bytes.
 */
public class XMLEncodingUtil {

    // Number of bytes (after an optional UTF-8 BOM) that are scanned for the XML declaration.
    private static final int MAX_PROLOG_BYTES = 1024;

    // Matches an XML declaration at the start of the text; group 1 holds its pseudo-attributes.
    private static final Pattern XML_DECL_PATTERN =
            Pattern.compile("^\\s*<\\?xml\\s+([^?]*?)\\?>",
                    Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    // Matches the encoding pseudo-attribute; group 2 holds the single- or double-quoted value.
    private static final Pattern ENCODING_ATTR_PATTERN =
            Pattern.compile("(?i)\\bencoding\\s*=\\s*(['\"])([^'\"]+)\\1");

    /**
     * Extracts the encoding of an XML document from its raw bytes.
     * <p>
     * A UTF-16 byte order mark is authoritative: UTF-16 is not ASCII-compatible, so the
     * declaration cannot be matched byte-wise and {@code "UTF-16"} is returned directly.
     * For all other input a small prefix is decoded as ISO-8859-1 (which maps bytes 1:1
     * to characters) and searched for the {@code encoding} pseudo-attribute of the XML
     * declaration. A leading UTF-8 byte order mark is skipped.
     * </p>
     *
     * @param bytes XML document bytes, may be {@code null}
     * @return encoding name in upper case (e.g. UTF-8, ISO-8859-1, UTF-16) or {@code null}
     *         if the input is empty or no encoding can be determined
     */
    public static String getEncodingFromXMLProlog(byte[] bytes) {
        if (bytes == null || bytes.length == 0) return null;

        // The charset name "UTF-16" lets the decoder pick the byte order from the BOM.
        if (hasUtf16Bom(bytes)) return "UTF-16";

        int offset = hasUtf8Bom(bytes) ? 3 : 0;

        // XML declaration must appear at the start (after BOM + whitespace)
        int max = Math.min(bytes.length, offset + MAX_PROLOG_BYTES);

        // ISO-8859-1 preserves byte values 1:1, which is safe for matching ASCII syntax
        String prefix = new String(bytes, offset, max - offset, ISO_8859_1);

        Matcher decl = XML_DECL_PATTERN.matcher(prefix);
        if (!decl.find()) return null;

        Matcher enc = ENCODING_ATTR_PATTERN.matcher(decl.group(1));
        if (!enc.find()) return null;

        return enc.group(2).trim().toUpperCase(Locale.ROOT);
    }

    // True if the bytes start with the UTF-8 byte order mark EF BB BF.
    private static boolean hasUtf8Bom(byte[] bytes) {
        return bytes.length >= 3
                && (bytes[0] & 0xFF) == 0xEF
                && (bytes[1] & 0xFF) == 0xBB
                && (bytes[2] & 0xFF) == 0xBF;
    }

    // True if the bytes start with a UTF-16 byte order mark (FE FF or FF FE).
    private static boolean hasUtf16Bom(byte[] bytes) {
        if (bytes.length < 2) return false;
        int first = bytes[0] & 0xFF;
        int second = bytes[1] & 0xFF;
        if (first == 0xFE && second == 0xFF) return true;
        if (first == 0xFF && second == 0xFE) {
            // FF FE 00 00 is the UTF-32 little-endian BOM, not UTF-16
            return !(bytes.length >= 4 && bytes[2] == 0 && bytes[3] == 0);
        }
        return false;
    }
}

core/src/main/java/com/predic8/membrane/core/util/xml/XMLUtil.java

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import com.predic8.membrane.core.http.*;
1717
import org.jetbrains.annotations.*;
18+
import org.slf4j.*;
1819
import org.w3c.dom.*;
1920
import org.xml.sax.*;
2021

@@ -24,44 +25,73 @@
2425
import javax.xml.transform.stream.*;
2526
import java.io.*;
2627
import java.util.*;
28+
import java.util.concurrent.atomic.*;
29+
import java.util.regex.*;
2730

2831
import static javax.xml.XMLConstants.*;
32+
import static javax.xml.transform.OutputKeys.*;
2933

3034
public class XMLUtil {
3135

32-
// TransformerFactory is *not* specified as thread-safe.
33-
// We keep one instance per thread. See:
34-
// https://docs.oracle.com/javase/8/docs/api/javax/xml/transform/TransformerFactory.html
35-
private static final ThreadLocal<TransformerFactory> TF = ThreadLocal.withInitial(() -> {
36+
private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
37+
38+
private static final AtomicBoolean loggedSecureProcessing = new AtomicBoolean();
39+
private static final AtomicBoolean loggedExternalAccess = new AtomicBoolean();
40+
41+
42+
// TransformerFactory is not specified as thread-safe and is mutable (features/attributes).
43+
// Use one instance per thread and create a fresh Transformer per transform (Transformer is not thread-safe).
44+
private static final ThreadLocal<TransformerFactory> SAFE_TRANSFORMER_FACTORY = ThreadLocal.withInitial(() -> {
3645
TransformerFactory f = TransformerFactory.newInstance();
46+
47+
// Best-effort hardening. Some implementations may not support all flags.
3748
try {
3849
// Enable secure processing (limits entity expansion etc.)
3950
f.setFeature(FEATURE_SECURE_PROCESSING, true);
4051
} catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
41-
// Log if you want, but do not fail hard because of missing feature
52+
if (loggedSecureProcessing.compareAndSet(false, true)) {
53+
log.warn("Could not enable secure XML processing (FEATURE_SECURE_PROCESSING): {}", e.getMessage());
54+
}
4255
}
4356

4457
try {
4558
// Disallow access to external DTDs and stylesheets (JAXP 1.5+)
4659
f.setAttribute(ACCESS_EXTERNAL_DTD, "");
4760
f.setAttribute(ACCESS_EXTERNAL_STYLESHEET, "");
48-
} catch (IllegalArgumentException ignored) {
61+
f.setAttribute(ACCESS_EXTERNAL_SCHEMA, "");
62+
} catch (IllegalArgumentException e) {
4963
// Attributes not supported by all implementations
64+
if (loggedExternalAccess.compareAndSet(false, true)) {
65+
log.warn("Could not disable external XML access (ACCESS_EXTERNAL_DTD/ACCESS_EXTERNAL_STYLESHEET): {}", e.getMessage());
66+
}
5067
}
5168

5269
return f;
5370
});
5471

72+
/**
73+
* Returns a {@link TransformerFactory} instance with XML hardening enabled on a
74+
* best-effort basis.
75+
* The returned factory is obtained from a thread-local cache because
76+
* {@link TransformerFactory} is mutable and not specified as thread-safe.
77+
* A fresh {@link javax.xml.transform.Transformer} must be created for each
78+
* transformation.
79+
*
80+
* @return a thread-local {@link TransformerFactory} with best-effort XML security hardening applied
81+
*/
82+
public static TransformerFactory newHardenedBestEffortTransformerFactory() {
83+
return SAFE_TRANSFORMER_FACTORY.get();
84+
}
5585

5686
public static String xmlNode2String(Node node) throws TransformerException {
5787
if (node == null) {
5888
return "";
5989
}
6090

61-
Transformer tf = TF.get().newTransformer();
62-
tf.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
63-
tf.setOutputProperty(OutputKeys.METHOD, "xml");
64-
tf.setOutputProperty(OutputKeys.INDENT, "yes");
91+
Transformer tf = SAFE_TRANSFORMER_FACTORY.get().newTransformer();
92+
tf.setOutputProperty(OMIT_XML_DECLARATION, "yes");
93+
tf.setOutputProperty(METHOD, "xml");
94+
tf.setOutputProperty(INDENT, "yes");
6595

6696
StringWriter writer = new StringWriter();
6797
tf.transform(new DOMSource(node), new StreamResult(writer));
@@ -74,7 +104,7 @@ public static QName groovyToJavaxQName(groovy.namespace.QName qName) {
74104

75105
/**
76106
* For XML processing sometimes an InputSource is needed.
77-
* @param msg
107+
* @param msg Message with body
78108
* @return InputSource of the message body
79109
*/
80110
public static @NotNull InputSource getInputSource(Message msg) {

0 commit comments

Comments
 (0)