@@ -68,56 +68,70 @@ static unsigned char* copy_with_xml_escape(unsigned char *dst, B_str src, int es
6868 return dst ;
6969}
7070
71- // Helper function to collect text from consecutive TEXT and CDATA nodes
72- // Returns the combined string and updates the node pointer to the first non-text node
73- // Note: cur_ptr is passed by reference (pointer to pointer) so we can update the caller's pointer
74- // to skip past all consumed text/CDATA nodes
71+ // Collect character data (text and CDATA) starting at *cur_ptr, up to but not
72+ // including the next element node. Comments are skipped over transparently, so
73+ // text on either side of a comment is concatenated (including whitespace).
74+ // Unsupported node types are rejected.
75+ // On return *cur_ptr points at the next element node, or NULL if the siblings
76+ // are exhausted; callers can therefore resume iterating element children
77+ // directly.
78+ //
79+ // Returns the combined text, or NULL if there was no character data (only
80+ // comments or nothing at all).
81+ //
82+ // Note: cur_ptr is passed by reference (pointer to pointer) so we can advance
83+ // the caller's cursor past everything we consumed.
7584static B_str collect_text_cdata_nodes (xmlNodePtr * cur_ptr ) {
85+ // First pass: sum the length of all text/CDATA content and locate the
86+ // element node (or end of siblings) where collection stops.
87+ size_t text_len = 0 ;
7688 xmlNodePtr cur = * cur_ptr ;
77- if (!cur || (cur -> type != XML_TEXT_NODE && cur -> type != XML_CDATA_SECTION_NODE )) {
78- return NULL ;
89+ while (cur && cur -> type != XML_ELEMENT_NODE ) {
90+ switch (cur -> type ) {
91+ case XML_TEXT_NODE :
92+ case XML_CDATA_SECTION_NODE :
93+ if (cur -> content )
94+ text_len += strlen ((char * )cur -> content );
95+ break ;
96+ case XML_COMMENT_NODE :
97+ break ;
98+ default :
99+ RAISE (stdQ_xmlQ_XmlParseError , $FORMAT ("Unsupported XML node type %d" , cur -> type ), NULL , NULL );
100+ }
101+ cur = cur -> next ;
79102 }
103+ xmlNodePtr stop = cur ;
80104
81- // Count total length of combined text and CDATA nodes
82- size_t text_len = 0 ;
83- xmlNodePtr text_start = cur ;
84- while (cur && (cur -> type == XML_TEXT_NODE || cur -> type == XML_CDATA_SECTION_NODE )) {
85- if (cur -> content ) text_len += strlen ((char * )cur -> content );
86- cur = cur -> next ;
105+ if (text_len == 0 ) {
106+ * cur_ptr = stop ;
107+ return NULL ;
87108 }
88109
89- // Create combined string
90- if (text_len > 0 ) {
91- char * combined = acton_malloc_atomic (text_len + 1 );
92- char * p = combined ;
93- xmlNodePtr t = text_start ;
94- while (t != cur ) {
95- if (t -> content ) {
96- size_t len = strlen ((char * )t -> content );
97- memcpy (p , t -> content , len );
98- p += len ;
99- }
100- t = t -> next ;
110+ // Second pass: concatenate the text/CDATA content into a single string.
111+ char * combined = acton_malloc_atomic (text_len + 1 );
112+ char * p = combined ;
113+ for (cur = * cur_ptr ; cur != stop ; cur = cur -> next ) {
114+ if ((cur -> type == XML_TEXT_NODE || cur -> type == XML_CDATA_SECTION_NODE ) && cur -> content ) {
115+ size_t len = strlen ((char * )cur -> content );
116+ memcpy (p , cur -> content , len );
117+ p += len ;
101118 }
102- * p = '\0' ;
103- B_str result = to_str_noc (combined );
104- * cur_ptr = cur ;
105- return result ;
106119 }
120+ * p = '\0' ;
107121
108- * cur_ptr = cur ;
109- return NULL ;
122+ * cur_ptr = stop ;
123+ return to_str_noc ( combined ) ;
110124}
111125
126+ // Convert a libxml2 element node into an Acton xml.Node.
127+ //
128+ // The returned Node has tail == NULL. An element's tail (the character data
129+ // following it, up to its next sibling element) belongs to the parent's child
130+ // sequence, so it is filled in by the caller while iterating siblings.
112131stdQ_xmlQ_Node stdQ_xmlQ_NodePtr2Node (xmlNodePtr node ) {
113132 B_SequenceD_list wit = B_SequenceD_listG_witness ;
114- if (node -> type == XML_COMMENT_NODE ) {
115- return NULL ;
116- }
117- if (node -> type != XML_ELEMENT_NODE ) {
118- char * errmsg = NULL ;
133+ if (node -> type != XML_ELEMENT_NODE )
119134 RAISE (stdQ_xmlQ_XmlParseError , $FORMAT ("Unexpected nodetype %d, content is %s" , node -> type , node -> content ), NULL , NULL );
120- }
121135
122136 B_list nsdefs = B_listG_new (NULL , NULL );
123137 xmlNsPtr nsDef = node -> nsDef ;
@@ -156,25 +170,21 @@ stdQ_xmlQ_Node stdQ_xmlQ_NodePtr2Node(xmlNodePtr node) {
156170 B_list children = B_listG_new (NULL , NULL );
157171 xmlNodePtr cur = node -> xmlChildrenNode ;
158172
159- // Collect initial text/CDATA nodes
173+ // Character data before the first child element becomes this node's text.
160174 B_str text = collect_text_cdata_nodes (& cur );
161175
176+ // collect_text_cdata_nodes stops only at element nodes (cur is updated), so
177+ // every node seen here is an element. Recurse into it, then collect the
178+ // character data that follows it (up to the next element) as that child's
179+ // tail.
162180 while (cur != NULL ) {
163181 stdQ_xmlQ_Node child = stdQ_xmlQ_NodePtr2Node (cur );
164- if (child )
165- wit -> $class -> append (wit ,children , child );
166182 cur = cur -> next ;
183+ child -> tail = collect_text_cdata_nodes (& cur );
184+ wit -> $class -> append (wit , children , child );
167185 }
168186
169- // Collect tail text/CDATA nodes after we have exhausted the child nodes
170- cur = node -> next ;
171- B_str tail = collect_text_cdata_nodes (& cur );
172- // Update the tree structure to skip consumed tail text/CDATA nodes.
173- // This prevents the parent from seeing these text nodes again during its
174- // child iteration, since tail text of an element is part of the parent's
175- // child list in the XML tree.
176- node -> next = cur ;
177- return (stdQ_xmlQ_Node )$NEW (stdQ_xmlQ_Node , to$str ((char * )node -> name ), nsdefs , prefix , attributes , children , text , tail );
187+ return (stdQ_xmlQ_Node )$NEW (stdQ_xmlQ_Node , to$str ((char * )node -> name ), nsdefs , prefix , attributes , children , text , NULL );
178188}
179189
180190stdQ_xmlQ_Node stdQ_xmlQ_decode (B_str data ) {
@@ -217,6 +227,10 @@ stdQ_xmlQ_Node stdQ_xmlQ_decode(B_str data) {
217227 RAISE (stdQ_xmlQ_XmlParseError , errmsg , line , column );
218228 }
219229 xmlNodePtr root = xmlDocGetRootElement (doc );
230+ if (!root ) {
231+ xmlFreeDoc (doc );
232+ RAISE (stdQ_xmlQ_XmlParseError , to$str ("Document has no root element" ), NULL , NULL );
233+ }
220234 stdQ_xmlQ_Node t = stdQ_xmlQ_NodePtr2Node (root );
221235 xmlFreeDoc (doc );
222236 return t ;
0 commit comments