Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 62 additions & 48 deletions base/src/xml.ext.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,56 +68,70 @@ static unsigned char* copy_with_xml_escape(unsigned char *dst, B_str src, int es
return dst;
}

// Helper function to collect text from consecutive TEXT and CDATA nodes
// Returns the combined string and updates the node pointer to the first non-text node
// Note: cur_ptr is passed by reference (pointer to pointer) so we can update the caller's pointer
// to skip past all consumed text/CDATA nodes
// Collect character data (text and CDATA) starting at *cur_ptr, up to but not
// including the next element node. Comments are skipped over transparently, so
// text on either side of a comment is concatenated (including whitespace).
// Unsupported node types are rejected.
// On return *cur_ptr points at the next element node, or NULL if the siblings
// are exhausted; callers can therefore resume iterating element children
// directly.
//
// Returns the combined text, or NULL if there was no character data (only
// comments or nothing at all).
//
// Note: cur_ptr is passed by reference (pointer to pointer) so we can advance
// the caller's cursor past everything we consumed.
static B_str collect_text_cdata_nodes(xmlNodePtr *cur_ptr) {
// First pass: sum the length of all text/CDATA content and locate the
// element node (or end of siblings) where collection stops.
size_t text_len = 0;
xmlNodePtr cur = *cur_ptr;
if (!cur || (cur->type != XML_TEXT_NODE && cur->type != XML_CDATA_SECTION_NODE)) {
return NULL;
while (cur && cur->type != XML_ELEMENT_NODE) {
switch (cur->type) {
case XML_TEXT_NODE:
case XML_CDATA_SECTION_NODE:
if (cur->content)
text_len += strlen((char *)cur->content);
break;
case XML_COMMENT_NODE:
break;
default:
RAISE(xmlQ_XmlParseError, $FORMAT("Unsupported XML node type %d", cur->type), NULL, NULL);
}
cur = cur->next;
}
xmlNodePtr stop = cur;

// Count total length of combined text and CDATA nodes
size_t text_len = 0;
xmlNodePtr text_start = cur;
while (cur && (cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE)) {
if (cur->content) text_len += strlen((char *)cur->content);
cur = cur->next;
if (text_len == 0) {
*cur_ptr = stop;
return NULL;
}

// Create combined string
if (text_len > 0) {
char *combined = acton_malloc_atomic(text_len + 1);
char *p = combined;
xmlNodePtr t = text_start;
while (t != cur) {
if (t->content) {
size_t len = strlen((char *)t->content);
memcpy(p, t->content, len);
p += len;
}
t = t->next;
// Second pass: concatenate the text/CDATA content into a single string.
char *combined = acton_malloc_atomic(text_len + 1);
char *p = combined;
for (cur = *cur_ptr; cur != stop; cur = cur->next) {
if ((cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE) && cur->content) {
size_t len = strlen((char *)cur->content);
memcpy(p, cur->content, len);
p += len;
}
*p = '\0';
B_str result = to_str_noc(combined);
*cur_ptr = cur;
return result;
}
*p = '\0';

*cur_ptr = cur;
return NULL;
*cur_ptr = stop;
return to_str_noc(combined);
}

// Convert a libxml2 element node into an Acton xml.Node.
//
// The returned Node has tail == NULL. An element's tail (the character data
// following it, up to its next sibling element) belongs to the parent's child
// sequence, so it is filled in by the caller while iterating siblings.
xmlQ_Node $NodePtr2Node(xmlNodePtr node) {
B_SequenceD_list wit = B_SequenceD_listG_witness;
if (node->type == XML_COMMENT_NODE) {
return NULL;
}
if (node->type != XML_ELEMENT_NODE) {
char *errmsg = NULL;
if (node->type != XML_ELEMENT_NODE)
RAISE(xmlQ_XmlParseError, $FORMAT("Unexpected nodetype %d, content is %s", node->type, node->content), NULL, NULL);
}

B_list nsdefs = B_listG_new(NULL, NULL);
xmlNsPtr nsDef = node->nsDef;
Expand Down Expand Up @@ -156,25 +170,21 @@ xmlQ_Node $NodePtr2Node(xmlNodePtr node) {
B_list children = B_listG_new(NULL, NULL);
xmlNodePtr cur = node->xmlChildrenNode;

// Collect initial text/CDATA nodes
// Character data before the first child element becomes this node's text.
B_str text = collect_text_cdata_nodes(&cur);

// collect_text_cdata_nodes stops only at element nodes (cur is updated), so
// every node seen here is an element. Recurse into it, then collect the
// character data that follows it (up to the next element) as that child's
// tail.
while (cur != NULL) {
xmlQ_Node child = $NodePtr2Node(cur);
if (child)
wit->$class->append(wit,children, child);
cur = cur->next;
child->tail = collect_text_cdata_nodes(&cur);
wit->$class->append(wit, children, child);
}

// Collect tail text/CDATA nodes after we have exhausted the child nodes
cur = node->next;
B_str tail = collect_text_cdata_nodes(&cur);
// Update the tree structure to skip consumed tail text/CDATA nodes.
// This prevents the parent from seeing these text nodes again during its
// child iteration, since tail text of an element is part of the parent's
// child list in the XML tree.
node->next = cur;
return (xmlQ_Node)$NEW(xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, tail);
return (xmlQ_Node)$NEW(xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, NULL);
}

xmlQ_Node xmlQ_decode(B_str data) {
Expand Down Expand Up @@ -217,6 +227,10 @@ xmlQ_Node xmlQ_decode(B_str data) {
RAISE(xmlQ_XmlParseError, errmsg, line, column);
}
xmlNodePtr root = xmlDocGetRootElement(doc);
if (!root) {
xmlFreeDoc(doc);
RAISE(xmlQ_XmlParseError, to$str("Document has no root element"), NULL, NULL);
}
xmlQ_Node t = $NodePtr2Node(root);
xmlFreeDoc(doc);
return t;
Expand Down
110 changes: 62 additions & 48 deletions std/src/std/xml.ext.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,56 +68,70 @@ static unsigned char* copy_with_xml_escape(unsigned char *dst, B_str src, int es
return dst;
}

// Helper function to collect text from consecutive TEXT and CDATA nodes
// Returns the combined string and updates the node pointer to the first non-text node
// Note: cur_ptr is passed by reference (pointer to pointer) so we can update the caller's pointer
// to skip past all consumed text/CDATA nodes
// Collect character data (text and CDATA) starting at *cur_ptr, up to but not
// including the next element node. Comments are skipped over transparently, so
// text on either side of a comment is concatenated (including whitespace).
// Unsupported node types are rejected.
// On return *cur_ptr points at the next element node, or NULL if the siblings
// are exhausted; callers can therefore resume iterating element children
// directly.
//
// Returns the combined text, or NULL if there was no character data (only
// comments or nothing at all).
//
// Note: cur_ptr is passed by reference (pointer to pointer) so we can advance
// the caller's cursor past everything we consumed.
static B_str collect_text_cdata_nodes(xmlNodePtr *cur_ptr) {
// First pass: sum the length of all text/CDATA content and locate the
// element node (or end of siblings) where collection stops.
size_t text_len = 0;
xmlNodePtr cur = *cur_ptr;
if (!cur || (cur->type != XML_TEXT_NODE && cur->type != XML_CDATA_SECTION_NODE)) {
return NULL;
while (cur && cur->type != XML_ELEMENT_NODE) {
switch (cur->type) {
case XML_TEXT_NODE:
case XML_CDATA_SECTION_NODE:
if (cur->content)
text_len += strlen((char *)cur->content);
break;
case XML_COMMENT_NODE:
break;
default:
RAISE(stdQ_xmlQ_XmlParseError, $FORMAT("Unsupported XML node type %d", cur->type), NULL, NULL);
}
cur = cur->next;
}
xmlNodePtr stop = cur;

// Count total length of combined text and CDATA nodes
size_t text_len = 0;
xmlNodePtr text_start = cur;
while (cur && (cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE)) {
if (cur->content) text_len += strlen((char *)cur->content);
cur = cur->next;
if (text_len == 0) {
*cur_ptr = stop;
return NULL;
}

// Create combined string
if (text_len > 0) {
char *combined = acton_malloc_atomic(text_len + 1);
char *p = combined;
xmlNodePtr t = text_start;
while (t != cur) {
if (t->content) {
size_t len = strlen((char *)t->content);
memcpy(p, t->content, len);
p += len;
}
t = t->next;
// Second pass: concatenate the text/CDATA content into a single string.
char *combined = acton_malloc_atomic(text_len + 1);
char *p = combined;
for (cur = *cur_ptr; cur != stop; cur = cur->next) {
if ((cur->type == XML_TEXT_NODE || cur->type == XML_CDATA_SECTION_NODE) && cur->content) {
size_t len = strlen((char *)cur->content);
memcpy(p, cur->content, len);
p += len;
}
*p = '\0';
B_str result = to_str_noc(combined);
*cur_ptr = cur;
return result;
}
*p = '\0';

*cur_ptr = cur;
return NULL;
*cur_ptr = stop;
return to_str_noc(combined);
}

// Convert a libxml2 element node into an Acton xml.Node.
//
// The returned Node has tail == NULL. An element's tail (the character data
// following it, up to its next sibling element) belongs to the parent's child
// sequence, so it is filled in by the caller while iterating siblings.
stdQ_xmlQ_Node stdQ_xmlQ_NodePtr2Node(xmlNodePtr node) {
B_SequenceD_list wit = B_SequenceD_listG_witness;
if (node->type == XML_COMMENT_NODE) {
return NULL;
}
if (node->type != XML_ELEMENT_NODE) {
char *errmsg = NULL;
if (node->type != XML_ELEMENT_NODE)
RAISE(stdQ_xmlQ_XmlParseError, $FORMAT("Unexpected nodetype %d, content is %s", node->type, node->content), NULL, NULL);
}

B_list nsdefs = B_listG_new(NULL, NULL);
xmlNsPtr nsDef = node->nsDef;
Expand Down Expand Up @@ -156,25 +170,21 @@ stdQ_xmlQ_Node stdQ_xmlQ_NodePtr2Node(xmlNodePtr node) {
B_list children = B_listG_new(NULL, NULL);
xmlNodePtr cur = node->xmlChildrenNode;

// Collect initial text/CDATA nodes
// Character data before the first child element becomes this node's text.
B_str text = collect_text_cdata_nodes(&cur);

// collect_text_cdata_nodes stops only at element nodes (cur is updated), so
// every node seen here is an element. Recurse into it, then collect the
// character data that follows it (up to the next element) as that child's
// tail.
while (cur != NULL) {
stdQ_xmlQ_Node child = stdQ_xmlQ_NodePtr2Node(cur);
if (child)
wit->$class->append(wit,children, child);
cur = cur->next;
child->tail = collect_text_cdata_nodes(&cur);
wit->$class->append(wit, children, child);
}

// Collect tail text/CDATA nodes after we have exhausted the child nodes
cur = node->next;
B_str tail = collect_text_cdata_nodes(&cur);
// Update the tree structure to skip consumed tail text/CDATA nodes.
// This prevents the parent from seeing these text nodes again during its
// child iteration, since tail text of an element is part of the parent's
// child list in the XML tree.
node->next = cur;
return (stdQ_xmlQ_Node)$NEW(stdQ_xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, tail);
return (stdQ_xmlQ_Node)$NEW(stdQ_xmlQ_Node, to$str((char *)node->name), nsdefs, prefix, attributes, children, text, NULL);
}

stdQ_xmlQ_Node stdQ_xmlQ_decode(B_str data) {
Expand Down Expand Up @@ -217,6 +227,10 @@ stdQ_xmlQ_Node stdQ_xmlQ_decode(B_str data) {
RAISE(stdQ_xmlQ_XmlParseError, errmsg, line, column);
}
xmlNodePtr root = xmlDocGetRootElement(doc);
if (!root) {
xmlFreeDoc(doc);
RAISE(stdQ_xmlQ_XmlParseError, to$str("Document has no root element"), NULL, NULL);
}
stdQ_xmlQ_Node t = stdQ_xmlQ_NodePtr2Node(root);
xmlFreeDoc(doc);
return t;
Expand Down
7 changes: 7 additions & 0 deletions test/stdlib_tests/src/test_xml.act
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,10 @@ def _test_xml_pretty():
)

return root.encode(True)

def _test_xml_comment():
    # Round-trip a document whose only child content is a comment followed by
    # text: the comment must not appear in the encoded output, while the text
    # after it must survive as the element's text.
    source = r"""<data><!-- comment -->b</data>"""
    want = r"""<data>b</data>"""
    got = xml.encode(xml.decode(source))
    testing.assertEqual(got, want, "comment should be removed but node text preserved")