Skip to content

Commit 4e4f675

Browse files
authored
Merge pull request #2886 from actonlang/xml-comment
Fix processing comments in XML
2 parents 1d2a9f9 + 85acf85 commit 4e4f675

3 files changed

Lines changed: 131 additions & 96 deletions

File tree

base/src/xml.ext.c

Lines changed: 62 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -68,56 +68,70 @@ static unsigned char* copy_with_xml_escape(unsigned char *dst, B_str src, int es
6868
return dst;
6969
}
7070

71-
// Helper function to collect text from consecutive TEXT and CDATA nodes
72-
// Returns the combined string and updates the node pointer to the first non-text node
73-
// Note: cur_ptr is passed by reference (pointer to pointer) so we can update the caller's pointer
74-
// to skip past all consumed text/CDATA nodes
71+
// Collect character data (text and CDATA) starting at *cur_ptr, up to but not
72+
// including the next element node. Comments are skipped over transparently, so
73+
// text on either side of a comment is concatenated (including whitespace).
74+
// Unsupported node types are rejected.
75+
// On return *cur_ptr points at the next element node, or NULL if the siblings
76+
// are exhausted; callers can therefore resume iterating element children
77+
// directly.
78+
//
79+
// Returns the combined text, or NULL if there was no character data (only
80+
// comments or nothing at all).
81+
//
82+
// Note: cur_ptr is passed by reference (pointer to pointer) so we can advance
83+
// the caller's cursor past everything we consumed.
7584
static B_str collect_text_cdata_nodes(xmlNodePtr *cur_ptr) {
85+
// First pass: sum the length of all text/CDATA content and locate the
86+
// element node (or end of siblings) where collection stops.
87+
size_t text_len = 0;
7688
xmlNodePtr cur = *cur_ptr;
77-
if (!cur || (cur->type != XML_TEXT_NODE && cur->type != XML_CDATA_SECTION_NODE)) {
78-
return NULL;
89+
while (cur && cur->type != XML_ELEMENT_NODE) {
90+
switch (cur->type) {
91+
case XML_TEXT_NODE:
92+
case XML_CDATA_SECTION_NODE:
93+
if (cur->content)
94+
text_len += strlen((char *)cur->content);
95+
break;
96+
case XML_COMMENT_NODE:
97+
break;
98+
default:
99+
RAISE(xmlQ_XmlParseError, $FORMAT("Unsupported XML node type %d", cur->type), NULL, NULL);
100+
}
101+
cur = cur->next;
79102
}
103+
xmlNodePtr stop = cur;
80104

81-
// Count total length of combined text and CDATA nodes
82-
size_t text_len = 0;
83-
xmlNodePtr text_start = cur;
84-
while (cur && (cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE)) {
85-
if (cur->content) text_len += strlen((char *)cur->content);
86-
cur = cur->next;
105+
if (text_len == 0) {
106+
*cur_ptr = stop;
107+
return NULL;
87108
}
88109

89-
// Create combined string
90-
if (text_len > 0) {
91-
char *combined = acton_malloc_atomic(text_len + 1);
92-
char *p = combined;
93-
xmlNodePtr t = text_start;
94-
while (t != cur) {
95-
if (t->content) {
96-
size_t len = strlen((char *)t->content);
97-
memcpy(p, t->content, len);
98-
p += len;
99-
}
100-
t = t->next;
110+
// Second pass: concatenate the text/CDATA content into a single string.
111+
char *combined = acton_malloc_atomic(text_len + 1);
112+
char *p = combined;
113+
for (cur = *cur_ptr; cur != stop; cur = cur->next) {
114+
if ((cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE) && cur->content) {
115+
size_t len = strlen((char *)cur->content);
116+
memcpy(p, cur->content, len);
117+
p += len;
101118
}
102-
*p = '\0';
103-
B_str result = to_str_noc(combined);
104-
*cur_ptr = cur;
105-
return result;
106119
}
120+
*p = '\0';
107121

108-
*cur_ptr = cur;
109-
return NULL;
122+
*cur_ptr = stop;
123+
return to_str_noc(combined);
110124
}
111125

126+
// Convert a libxml2 element node into an Acton xml.Node.
127+
//
128+
// The returned Node has tail == NULL. An element's tail (the character data
129+
// following it, up to its next sibling element) belongs to the parent's child
130+
// sequence, so it is filled in by the caller while iterating siblings.
112131
xmlQ_Node $NodePtr2Node(xmlNodePtr node) {
113132
B_SequenceD_list wit = B_SequenceD_listG_witness;
114-
if (node->type == XML_COMMENT_NODE) {
115-
return NULL;
116-
}
117-
if (node->type != XML_ELEMENT_NODE) {
118-
char *errmsg = NULL;
133+
if (node->type != XML_ELEMENT_NODE)
119134
RAISE(xmlQ_XmlParseError, $FORMAT("Unexpected nodetype %d, content is %s", node->type, node->content), NULL, NULL);
120-
}
121135

122136
B_list nsdefs = B_listG_new(NULL, NULL);
123137
xmlNsPtr nsDef = node->nsDef;
@@ -156,25 +170,21 @@ xmlQ_Node $NodePtr2Node(xmlNodePtr node) {
156170
B_list children = B_listG_new(NULL, NULL);
157171
xmlNodePtr cur = node->xmlChildrenNode;
158172

159-
// Collect initial text/CDATA nodes
173+
// Character data before the first child element becomes this node's text.
160174
B_str text = collect_text_cdata_nodes(&cur);
161175

176+
// collect_text_cdata_nodes stops only at element nodes (cur is updated), so
177+
// every node seen here is an element. Recurse into it, then collect the
178+
// character data that follows it (up to the next element) as that child's
179+
// tail.
162180
while (cur != NULL) {
163181
xmlQ_Node child = $NodePtr2Node(cur);
164-
if (child)
165-
wit->$class->append(wit,children, child);
166182
cur = cur->next;
183+
child->tail = collect_text_cdata_nodes(&cur);
184+
wit->$class->append(wit, children, child);
167185
}
168186

169-
// Collect tail text/CDATA nodes after we have exhausted the child nodes
170-
cur = node->next;
171-
B_str tail = collect_text_cdata_nodes(&cur);
172-
// Update the tree structure to skip consumed tail text/CDATA nodes.
173-
// This prevents the parent from seeing these text nodes again during its
174-
// child iteration, since tail text of an element is part of the parent's
175-
// child list in the XML tree.
176-
node->next = cur;
177-
return (xmlQ_Node)$NEW(xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, tail);
187+
return (xmlQ_Node)$NEW(xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, NULL);
178188
}
179189

180190
xmlQ_Node xmlQ_decode(B_str data) {
@@ -217,6 +227,10 @@ xmlQ_Node xmlQ_decode(B_str data) {
217227
RAISE(xmlQ_XmlParseError, errmsg, line, column);
218228
}
219229
xmlNodePtr root = xmlDocGetRootElement(doc);
230+
if (!root) {
231+
xmlFreeDoc(doc);
232+
RAISE(xmlQ_XmlParseError, to$str("Document has no root element"), NULL, NULL);
233+
}
220234
xmlQ_Node t = $NodePtr2Node(root);
221235
xmlFreeDoc(doc);
222236
return t;

std/src/std/xml.ext.c

Lines changed: 62 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -68,56 +68,70 @@ static unsigned char* copy_with_xml_escape(unsigned char *dst, B_str src, int es
6868
return dst;
6969
}
7070

71-
// Helper function to collect text from consecutive TEXT and CDATA nodes
72-
// Returns the combined string and updates the node pointer to the first non-text node
73-
// Note: cur_ptr is passed by reference (pointer to pointer) so we can update the caller's pointer
74-
// to skip past all consumed text/CDATA nodes
71+
// Collect character data (text and CDATA) starting at *cur_ptr, up to but not
72+
// including the next element node. Comments are skipped over transparently, so
73+
// text on either side of a comment is concatenated (including whitespace).
74+
// Unsupported node types are rejected.
75+
// On return *cur_ptr points at the next element node, or NULL if the siblings
76+
// are exhausted; callers can therefore resume iterating element children
77+
// directly.
78+
//
79+
// Returns the combined text, or NULL if there was no character data (only
80+
// comments or nothing at all).
81+
//
82+
// Note: cur_ptr is passed by reference (pointer to pointer) so we can advance
83+
// the caller's cursor past everything we consumed.
7584
static B_str collect_text_cdata_nodes(xmlNodePtr *cur_ptr) {
85+
// First pass: sum the length of all text/CDATA content and locate the
86+
// element node (or end of siblings) where collection stops.
87+
size_t text_len = 0;
7688
xmlNodePtr cur = *cur_ptr;
77-
if (!cur || (cur->type != XML_TEXT_NODE && cur->type != XML_CDATA_SECTION_NODE)) {
78-
return NULL;
89+
while (cur && cur->type != XML_ELEMENT_NODE) {
90+
switch (cur->type) {
91+
case XML_TEXT_NODE:
92+
case XML_CDATA_SECTION_NODE:
93+
if (cur->content)
94+
text_len += strlen((char *)cur->content);
95+
break;
96+
case XML_COMMENT_NODE:
97+
break;
98+
default:
99+
RAISE(stdQ_xmlQ_XmlParseError, $FORMAT("Unsupported XML node type %d", cur->type), NULL, NULL);
100+
}
101+
cur = cur->next;
79102
}
103+
xmlNodePtr stop = cur;
80104

81-
// Count total length of combined text and CDATA nodes
82-
size_t text_len = 0;
83-
xmlNodePtr text_start = cur;
84-
while (cur && (cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE)) {
85-
if (cur->content) text_len += strlen((char *)cur->content);
86-
cur = cur->next;
105+
if (text_len == 0) {
106+
*cur_ptr = stop;
107+
return NULL;
87108
}
88109

89-
// Create combined string
90-
if (text_len > 0) {
91-
char *combined = acton_malloc_atomic(text_len + 1);
92-
char *p = combined;
93-
xmlNodePtr t = text_start;
94-
while (t != cur) {
95-
if (t->content) {
96-
size_t len = strlen((char *)t->content);
97-
memcpy(p, t->content, len);
98-
p += len;
99-
}
100-
t = t->next;
110+
// Second pass: concatenate the text/CDATA content into a single string.
111+
char *combined = acton_malloc_atomic(text_len + 1);
112+
char *p = combined;
113+
for (cur = *cur_ptr; cur != stop; cur = cur->next) {
114+
if ((cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE) && cur->content) {
115+
size_t len = strlen((char *)cur->content);
116+
memcpy(p, cur->content, len);
117+
p += len;
101118
}
102-
*p = '\0';
103-
B_str result = to_str_noc(combined);
104-
*cur_ptr = cur;
105-
return result;
106119
}
120+
*p = '\0';
107121

108-
*cur_ptr = cur;
109-
return NULL;
122+
*cur_ptr = stop;
123+
return to_str_noc(combined);
110124
}
111125

126+
// Convert a libxml2 element node into an Acton xml.Node.
127+
//
128+
// The returned Node has tail == NULL. An element's tail (the character data
129+
// following it, up to its next sibling element) belongs to the parent's child
130+
// sequence, so it is filled in by the caller while iterating siblings.
112131
stdQ_xmlQ_Node stdQ_xmlQ_NodePtr2Node(xmlNodePtr node) {
113132
B_SequenceD_list wit = B_SequenceD_listG_witness;
114-
if (node->type == XML_COMMENT_NODE) {
115-
return NULL;
116-
}
117-
if (node->type != XML_ELEMENT_NODE) {
118-
char *errmsg = NULL;
133+
if (node->type != XML_ELEMENT_NODE)
119134
RAISE(stdQ_xmlQ_XmlParseError, $FORMAT("Unexpected nodetype %d, content is %s", node->type, node->content), NULL, NULL);
120-
}
121135

122136
B_list nsdefs = B_listG_new(NULL, NULL);
123137
xmlNsPtr nsDef = node->nsDef;
@@ -156,25 +170,21 @@ stdQ_xmlQ_Node stdQ_xmlQ_NodePtr2Node(xmlNodePtr node) {
156170
B_list children = B_listG_new(NULL, NULL);
157171
xmlNodePtr cur = node->xmlChildrenNode;
158172

159-
// Collect initial text/CDATA nodes
173+
// Character data before the first child element becomes this node's text.
160174
B_str text = collect_text_cdata_nodes(&cur);
161175

176+
// collect_text_cdata_nodes stops only at element nodes (cur is updated), so
177+
// every node seen here is an element. Recurse into it, then collect the
178+
// character data that follows it (up to the next element) as that child's
179+
// tail.
162180
while (cur != NULL) {
163181
stdQ_xmlQ_Node child = stdQ_xmlQ_NodePtr2Node(cur);
164-
if (child)
165-
wit->$class->append(wit,children, child);
166182
cur = cur->next;
183+
child->tail = collect_text_cdata_nodes(&cur);
184+
wit->$class->append(wit, children, child);
167185
}
168186

169-
// Collect tail text/CDATA nodes after we have exhausted the child nodes
170-
cur = node->next;
171-
B_str tail = collect_text_cdata_nodes(&cur);
172-
// Update the tree structure to skip consumed tail text/CDATA nodes.
173-
// This prevents the parent from seeing these text nodes again during its
174-
// child iteration, since tail text of an element is part of the parent's
175-
// child list in the XML tree.
176-
node->next = cur;
177-
return (stdQ_xmlQ_Node)$NEW(stdQ_xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, tail);
187+
return (stdQ_xmlQ_Node)$NEW(stdQ_xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, NULL);
178188
}
179189

180190
stdQ_xmlQ_Node stdQ_xmlQ_decode(B_str data) {
@@ -217,6 +227,10 @@ stdQ_xmlQ_Node stdQ_xmlQ_decode(B_str data) {
217227
RAISE(stdQ_xmlQ_XmlParseError, errmsg, line, column);
218228
}
219229
xmlNodePtr root = xmlDocGetRootElement(doc);
230+
if (!root) {
231+
xmlFreeDoc(doc);
232+
RAISE(stdQ_xmlQ_XmlParseError, to$str("Document has no root element"), NULL, NULL);
233+
}
220234
stdQ_xmlQ_Node t = stdQ_xmlQ_NodePtr2Node(root);
221235
xmlFreeDoc(doc);
222236
return t;

test/stdlib_tests/src/test_xml.act

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,3 +213,10 @@ def _test_xml_pretty():
213213
)
214214

215215
return root.encode(True)
216+
217+
def _test_xml_comment():
218+
sn = r"""<data><!-- comment -->b</data>"""
219+
d = xml.decode(sn)
220+
e = xml.encode(d)
221+
expected = r"""<data>b</data>"""
222+
testing.assertEqual(e, expected, "comment should be removed but node text preserved")

0 commit comments

Comments
 (0)