Skip to content

Commit f96bd33

Browse files
committed
Improve merge_text_nodes
1 parent dc1b51d commit f96bd33

1 file changed

Lines changed: 45 additions & 26 deletions

File tree

selectolax/lexbor/node.pxi

Lines changed: 45 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -579,32 +579,7 @@ cdef class LexborNode:
579579
>>> tree.text(deep=True, separator=" ", strip=True)
580580
"John Doe"
581581
"""
582-
cdef lxb_dom_node_t *node = self.node.first_child
583-
cdef lxb_dom_node_t *next_node
584-
cdef lxb_char_t *left_text
585-
cdef lxb_char_t *right_text
586-
cdef size_t left_length, right_length
587-
588-
while node != NULL:
589-
next_node = node.next
590-
591-
if node.type == LXB_DOM_NODE_TYPE_TEXT:
592-
while next_node != NULL and next_node.type == LXB_DOM_NODE_TYPE_TEXT:
593-
left_text = lxb_dom_node_text_content(node, &left_length)
594-
right_text = lxb_dom_node_text_content(next_node, &right_length)
595-
596-
if left_text and right_text:
597-
combined = (<bytes> left_text[:left_length]) + (<bytes> right_text[:right_length])
598-
lxb_dom_node_text_content_set(node, combined, len(combined))
599-
lxb_dom_node_remove(next_node)
600-
601-
next_node = node.next
602-
else:
603-
break
604-
605-
if node.type == LXB_DOM_NODE_TYPE_ELEMENT and node.first_child:
606-
LexborNode.new(node, self.parser).merge_text_nodes()
607-
node = next_node
582+
_merge_text_nodes(self.node)
608583

609584
def traverse(self, bool include_text = False, bool skip_empty = False):
610585
"""Depth-first traversal starting at the current node.
@@ -1156,3 +1131,47 @@ cdef lxb_status_t serialize_fragment(lxb_dom_node_t *node, lexbor_str_t *lxb_str
11561131

11571132
cdef inline bint _is_node_type(lxb_dom_node_t *node, lxb_dom_node_type_t expected_type):
    # NULL-safe type test: a missing node never matches any type.
    if node == NULL:
        return False
    return node.type == expected_type
1134+
1135+
cdef void _merge_text_nodes(lxb_dom_node_t *root):
1136+
if root == NULL or node_is_removed(root):
1137+
return
1138+
1139+
cdef lxb_dom_node_t *node
1140+
cdef lxb_dom_node_t *next_node
1141+
cdef lxb_dom_text_t *new_text_node
1142+
cdef lxb_char_t *left_text
1143+
cdef lxb_char_t *right_text
1144+
cdef size_t left_length, right_length
1145+
cdef bytes combined
1146+
1147+
cdef bint changed = 1
1148+
while changed:
1149+
changed = 0
1150+
node = root.first_child
1151+
while node != NULL:
1152+
next_node = node.next
1153+
if node.type == LXB_DOM_NODE_TYPE_TEXT and next_node != NULL and next_node.type == LXB_DOM_NODE_TYPE_TEXT:
1154+
left_text = lxb_dom_node_text_content(node, &left_length)
1155+
right_text = lxb_dom_node_text_content(next_node, &right_length)
1156+
1157+
if left_text != NULL and right_text != NULL:
1158+
combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
1159+
new_text_node = lxb_dom_document_create_text_node(
1160+
root.owner_document,
1161+
<lxb_char_t *>combined,
1162+
len(combined)
1163+
)
1164+
if new_text_node != NULL:
1165+
lxb_dom_node_insert_before(node, <lxb_dom_node_t *>new_text_node)
1166+
lxb_dom_node_remove(node)
1167+
lxb_dom_node_remove(next_node)
1168+
changed = 1
1169+
break
1170+
1171+
node = next_node
1172+
1173+
node = root.first_child
1174+
while node != NULL:
1175+
if node.type == LXB_DOM_NODE_TYPE_ELEMENT and node.first_child:
1176+
_merge_text_nodes(node)
1177+
node = node.next

0 commit comments

Comments
 (0)