Skip to content

Commit 85acf85

Browse files
committed
Fix processing comments in XML
When collecting element text (text and CDATA), skip over comments. Unsupported nodes (entity references, processing instructions) are rejected.
1 parent 1d2a9f9 commit 85acf85

3 files changed

Lines changed: 131 additions & 96 deletions

File tree

base/src/xml.ext.c

Lines changed: 62 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -68,56 +68,70 @@ static unsigned char* copy_with_xml_escape(unsigned char *dst, B_str src, int es
6868
return dst;
6969
}
7070

71-
// Helper function to collect text from consecutive TEXT and CDATA nodes
72-
// Returns the combined string and updates the node pointer to the first non-text node
73-
// Note: cur_ptr is passed by reference (pointer to pointer) so we can update the caller's pointer
74-
// to skip past all consumed text/CDATA nodes
71+
// Collect character data (text and CDATA) starting at *cur_ptr, up to but not
72+
// including the next element node. Comments are skipped over transparently, so
73+
// text on either side of a comment is concatenated (including whitespace).
74+
// Unsupported node types are rejected.
75+
// On return *cur_ptr points at the next element node, or NULL if the siblings
76+
// are exhausted; callers can therefore resume iterating element children
77+
// directly.
78+
//
79+
// Returns the combined text, or NULL if there was no character data (only
80+
// comments or nothing at all).
81+
//
82+
// Note: cur_ptr is passed by reference (pointer to pointer) so we can advance
83+
// the caller's cursor past everything we consumed.
7584
static B_str collect_text_cdata_nodes(xmlNodePtr *cur_ptr) {
85+
// First pass: sum the length of all text/CDATA content and locate the
86+
// element node (or end of siblings) where collection stops.
87+
size_t text_len = 0;
7688
xmlNodePtr cur = *cur_ptr;
77-
if (!cur || (cur->type != XML_TEXT_NODE && cur->type != XML_CDATA_SECTION_NODE)) {
78-
return NULL;
89+
while (cur && cur->type != XML_ELEMENT_NODE) {
90+
switch (cur->type) {
91+
case XML_TEXT_NODE:
92+
case XML_CDATA_SECTION_NODE:
93+
if (cur->content)
94+
text_len += strlen((char *)cur->content);
95+
break;
96+
case XML_COMMENT_NODE:
97+
break;
98+
default:
99+
RAISE(xmlQ_XmlParseError, $FORMAT("Unsupported XML node type %d", cur->type), NULL, NULL);
100+
}
101+
cur = cur->next;
79102
}
103+
xmlNodePtr stop = cur;
80104

81-
// Count total length of combined text and CDATA nodes
82-
size_t text_len = 0;
83-
xmlNodePtr text_start = cur;
84-
while (cur && (cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE)) {
85-
if (cur->content) text_len += strlen((char *)cur->content);
86-
cur = cur->next;
105+
if (text_len == 0) {
106+
*cur_ptr = stop;
107+
return NULL;
87108
}
88109

89-
// Create combined string
90-
if (text_len > 0) {
91-
char *combined = acton_malloc_atomic(text_len + 1);
92-
char *p = combined;
93-
xmlNodePtr t = text_start;
94-
while (t != cur) {
95-
if (t->content) {
96-
size_t len = strlen((char *)t->content);
97-
memcpy(p, t->content, len);
98-
p += len;
99-
}
100-
t = t->next;
110+
// Second pass: concatenate the text/CDATA content into a single string.
111+
char *combined = acton_malloc_atomic(text_len + 1);
112+
char *p = combined;
113+
for (cur = *cur_ptr; cur != stop; cur = cur->next) {
114+
if ((cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE) && cur->content) {
115+
size_t len = strlen((char *)cur->content);
116+
memcpy(p, cur->content, len);
117+
p += len;
101118
}
102-
*p = '\0';
103-
B_str result = to_str_noc(combined);
104-
*cur_ptr = cur;
105-
return result;
106119
}
120+
*p = '\0';
107121

108-
*cur_ptr = cur;
109-
return NULL;
122+
*cur_ptr = stop;
123+
return to_str_noc(combined);
110124
}
111125

126+
// Convert a libxml2 element node into an Acton xml.Node.
127+
//
128+
// The returned Node has tail == NULL. An element's tail (the character data
129+
// following it, up to its next sibling element) belongs to the parent's child
130+
// sequence, so it is filled in by the caller while iterating siblings.
112131
xmlQ_Node $NodePtr2Node(xmlNodePtr node) {
113132
B_SequenceD_list wit = B_SequenceD_listG_witness;
114-
if (node->type == XML_COMMENT_NODE) {
115-
return NULL;
116-
}
117-
if (node->type != XML_ELEMENT_NODE) {
118-
char *errmsg = NULL;
133+
if (node->type != XML_ELEMENT_NODE)
119134
RAISE(xmlQ_XmlParseError, $FORMAT("Unexpected nodetype %d, content is %s", node->type, node->content), NULL, NULL);
120-
}
121135

122136
B_list nsdefs = B_listG_new(NULL, NULL);
123137
xmlNsPtr nsDef = node->nsDef;
@@ -156,25 +170,21 @@ xmlQ_Node $NodePtr2Node(xmlNodePtr node) {
156170
B_list children = B_listG_new(NULL, NULL);
157171
xmlNodePtr cur = node->xmlChildrenNode;
158172

159-
// Collect initial text/CDATA nodes
173+
// Character data before the first child element becomes this node's text.
160174
B_str text = collect_text_cdata_nodes(&cur);
161175

176+
// collect_text_cdata_nodes stops only at element nodes (cur is updated), so
177+
// every node seen here is an element. Recurse into it, then collect the
178+
// character data that follows it (up to the next element) as that child's
179+
// tail.
162180
while (cur != NULL) {
163181
xmlQ_Node child = $NodePtr2Node(cur);
164-
if (child)
165-
wit->$class->append(wit,children, child);
166182
cur = cur->next;
183+
child->tail = collect_text_cdata_nodes(&cur);
184+
wit->$class->append(wit, children, child);
167185
}
168186

169-
// Collect tail text/CDATA nodes after we have exhausted the child nodes
170-
cur = node->next;
171-
B_str tail = collect_text_cdata_nodes(&cur);
172-
// Update the tree structure to skip consumed tail text/CDATA nodes.
173-
// This prevents the parent from seeing these text nodes again during its
174-
// child iteration, since tail text of an element is part of the parent's
175-
// child list in the XML tree.
176-
node->next = cur;
177-
return (xmlQ_Node)$NEW(xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, tail);
187+
return (xmlQ_Node)$NEW(xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, NULL);
178188
}
179189

180190
xmlQ_Node xmlQ_decode(B_str data) {
@@ -217,6 +227,10 @@ xmlQ_Node xmlQ_decode(B_str data) {
217227
RAISE(xmlQ_XmlParseError, errmsg, line, column);
218228
}
219229
xmlNodePtr root = xmlDocGetRootElement(doc);
230+
if (!root) {
231+
xmlFreeDoc(doc);
232+
RAISE(xmlQ_XmlParseError, to$str("Document has no root element"), NULL, NULL);
233+
}
220234
xmlQ_Node t = $NodePtr2Node(root);
221235
xmlFreeDoc(doc);
222236
return t;

std/src/std/xml.ext.c

Lines changed: 62 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -68,56 +68,70 @@ static unsigned char* copy_with_xml_escape(unsigned char *dst, B_str src, int es
6868
return dst;
6969
}
7070

71-
// Helper function to collect text from consecutive TEXT and CDATA nodes
72-
// Returns the combined string and updates the node pointer to the first non-text node
73-
// Note: cur_ptr is passed by reference (pointer to pointer) so we can update the caller's pointer
74-
// to skip past all consumed text/CDATA nodes
71+
// Collect character data (text and CDATA) starting at *cur_ptr, up to but not
72+
// including the next element node. Comments are skipped over transparently, so
73+
// text on either side of a comment is concatenated (including whitespace).
74+
// Unsupported node types are rejected.
75+
// On return *cur_ptr points at the next element node, or NULL if the siblings
76+
// are exhausted; callers can therefore resume iterating element children
77+
// directly.
78+
//
79+
// Returns the combined text, or NULL if there was no character data (only
80+
// comments or nothing at all).
81+
//
82+
// Note: cur_ptr is passed by reference (pointer to pointer) so we can advance
83+
// the caller's cursor past everything we consumed.
7584
static B_str collect_text_cdata_nodes(xmlNodePtr *cur_ptr) {
85+
// First pass: sum the length of all text/CDATA content and locate the
86+
// element node (or end of siblings) where collection stops.
87+
size_t text_len = 0;
7688
xmlNodePtr cur = *cur_ptr;
77-
if (!cur || (cur->type != XML_TEXT_NODE && cur->type != XML_CDATA_SECTION_NODE)) {
78-
return NULL;
89+
while (cur && cur->type != XML_ELEMENT_NODE) {
90+
switch (cur->type) {
91+
case XML_TEXT_NODE:
92+
case XML_CDATA_SECTION_NODE:
93+
if (cur->content)
94+
text_len += strlen((char *)cur->content);
95+
break;
96+
case XML_COMMENT_NODE:
97+
break;
98+
default:
99+
RAISE(stdQ_xmlQ_XmlParseError, $FORMAT("Unsupported XML node type %d", cur->type), NULL, NULL);
100+
}
101+
cur = cur->next;
79102
}
103+
xmlNodePtr stop = cur;
80104

81-
// Count total length of combined text and CDATA nodes
82-
size_t text_len = 0;
83-
xmlNodePtr text_start = cur;
84-
while (cur && (cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE)) {
85-
if (cur->content) text_len += strlen((char *)cur->content);
86-
cur = cur->next;
105+
if (text_len == 0) {
106+
*cur_ptr = stop;
107+
return NULL;
87108
}
88109

89-
// Create combined string
90-
if (text_len > 0) {
91-
char *combined = acton_malloc_atomic(text_len + 1);
92-
char *p = combined;
93-
xmlNodePtr t = text_start;
94-
while (t != cur) {
95-
if (t->content) {
96-
size_t len = strlen((char *)t->content);
97-
memcpy(p, t->content, len);
98-
p += len;
99-
}
100-
t = t->next;
110+
// Second pass: concatenate the text/CDATA content into a single string.
111+
char *combined = acton_malloc_atomic(text_len + 1);
112+
char *p = combined;
113+
for (cur = *cur_ptr; cur != stop; cur = cur->next) {
114+
if ((cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE) && cur->content) {
115+
size_t len = strlen((char *)cur->content);
116+
memcpy(p, cur->content, len);
117+
p += len;
101118
}
102-
*p = '\0';
103-
B_str result = to_str_noc(combined);
104-
*cur_ptr = cur;
105-
return result;
106119
}
120+
*p = '\0';
107121

108-
*cur_ptr = cur;
109-
return NULL;
122+
*cur_ptr = stop;
123+
return to_str_noc(combined);
110124
}
111125

126+
// Convert a libxml2 element node into an Acton xml.Node.
127+
//
128+
// The returned Node has tail == NULL. An element's tail (the character data
129+
// following it, up to its next sibling element) belongs to the parent's child
130+
// sequence, so it is filled in by the caller while iterating siblings.
112131
stdQ_xmlQ_Node stdQ_xmlQ_NodePtr2Node(xmlNodePtr node) {
113132
B_SequenceD_list wit = B_SequenceD_listG_witness;
114-
if (node->type == XML_COMMENT_NODE) {
115-
return NULL;
116-
}
117-
if (node->type != XML_ELEMENT_NODE) {
118-
char *errmsg = NULL;
133+
if (node->type != XML_ELEMENT_NODE)
119134
RAISE(stdQ_xmlQ_XmlParseError, $FORMAT("Unexpected nodetype %d, content is %s", node->type, node->content), NULL, NULL);
120-
}
121135

122136
B_list nsdefs = B_listG_new(NULL, NULL);
123137
xmlNsPtr nsDef = node->nsDef;
@@ -156,25 +170,21 @@ stdQ_xmlQ_Node stdQ_xmlQ_NodePtr2Node(xmlNodePtr node) {
156170
B_list children = B_listG_new(NULL, NULL);
157171
xmlNodePtr cur = node->xmlChildrenNode;
158172

159-
// Collect initial text/CDATA nodes
173+
// Character data before the first child element becomes this node's text.
160174
B_str text = collect_text_cdata_nodes(&cur);
161175

176+
// collect_text_cdata_nodes stops only at element nodes (cur is updated), so
177+
// every node seen here is an element. Recurse into it, then collect the
178+
// character data that follows it (up to the next element) as that child's
179+
// tail.
162180
while (cur != NULL) {
163181
stdQ_xmlQ_Node child = stdQ_xmlQ_NodePtr2Node(cur);
164-
if (child)
165-
wit->$class->append(wit,children, child);
166182
cur = cur->next;
183+
child->tail = collect_text_cdata_nodes(&cur);
184+
wit->$class->append(wit, children, child);
167185
}
168186

169-
// Collect tail text/CDATA nodes after we have exhausted the child nodes
170-
cur = node->next;
171-
B_str tail = collect_text_cdata_nodes(&cur);
172-
// Update the tree structure to skip consumed tail text/CDATA nodes.
173-
// This prevents the parent from seeing these text nodes again during its
174-
// child iteration, since tail text of an element is part of the parent's
175-
// child list in the XML tree.
176-
node->next = cur;
177-
return (stdQ_xmlQ_Node)$NEW(stdQ_xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, tail);
187+
return (stdQ_xmlQ_Node)$NEW(stdQ_xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, NULL);
178188
}
179189

180190
stdQ_xmlQ_Node stdQ_xmlQ_decode(B_str data) {
@@ -217,6 +227,10 @@ stdQ_xmlQ_Node stdQ_xmlQ_decode(B_str data) {
217227
RAISE(stdQ_xmlQ_XmlParseError, errmsg, line, column);
218228
}
219229
xmlNodePtr root = xmlDocGetRootElement(doc);
230+
if (!root) {
231+
xmlFreeDoc(doc);
232+
RAISE(stdQ_xmlQ_XmlParseError, to$str("Document has no root element"), NULL, NULL);
233+
}
220234
stdQ_xmlQ_Node t = stdQ_xmlQ_NodePtr2Node(root);
221235
xmlFreeDoc(doc);
222236
return t;

test/stdlib_tests/src/test_xml.act

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,3 +213,10 @@ def _test_xml_pretty():
213213
)
214214

215215
return root.encode(True)
216+
217+
def _test_xml_comment():
218+
sn = r"""<data><!-- comment -->b</data>"""
219+
d = xml.decode(sn)
220+
e = xml.encode(d)
221+
expected = r"""<data>b</data>"""
222+
testing.assertEqual(e, expected, "comment should be removed but node text preserved")

0 commit comments

Comments
 (0)