Add "unwrap empty tag" functionality (#161)

palandovalex · web-flow · commit 31a9cb3bd683 · 2025-04-30T14:36:02.000+04:00
* Update node.pxi

fix unwrapping in 'lexbor' wrapper

* Update node.pxi

change modest unwrap

* Update node.pxi

fix spelling

* Update test_nodes.py

Testing new unwrap mechanics
diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi
@@ -416,7 +416,7 @@ cdef class LexborNode:
             node = node.next
 
 
-    def unwrap(self):
+    def unwrap(self, delete_empty=False):
         """Replace node with whatever is inside this node.
 
         Examples
@@ -426,9 +426,12 @@ cdef class LexborNode:
         >>>  tree.css_first('i').unwrap()
         >>>  tree.html
         '<html><head></head><body><div>Hello world!</div></body></html>'
-
+        
+        Note: by default, empty tags are ignored; set "delete_empty" to "True" to change this.
         """
         if self.node.first_child == NULL:
+            if delete_empty:
+                lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
             return
         cdef lxb_dom_node_t* next_node;
         cdef lxb_dom_node_t* current_node;
@@ -445,7 +448,7 @@ cdef class LexborNode:
             lxb_dom_node_insert_before(self.node, self.node.first_child)
         lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
 
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty=False):
         """Unwraps specified tags from the HTML tree.
 
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -462,11 +465,13 @@ cdef class LexborNode:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
+        
+        Note: by default, empty tags are ignored; set "delete_empty" to "True" to change this.
         """
 
         for tag in tags:
             for element in self.css(tag):
-                element.unwrap()
+                element.unwrap(delete_empty)
 
 
     def traverse(self, include_text=False):
diff --git a/selectolax/modest/node.pxi b/selectolax/modest/node.pxi
@@ -515,7 +515,7 @@ cdef class Node:
         """An alias for the decompose method."""
         self.decompose(recursive)
 
-    def unwrap(self):
+    def unwrap(self, delete_empty=False):
         """Replace node with whatever is inside this node.
 
         Examples
@@ -526,8 +526,11 @@ cdef class Node:
         >>>  tree.html
         '<html><head></head><body><div>Hello world!</div></body></html>'
 
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
         if self.node.child == NULL:
+            if delete_empty:
+                myhtml_node_delete(self.node)
             return
         cdef myhtml_tree_node_t* next_node;
         cdef myhtml_tree_node_t* current_node;
@@ -568,7 +571,7 @@ cdef class Node:
             for element in self.css(tag):
                 element.decompose(recursive=recursive)
 
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty=False):
         """Unwraps specified tags from the HTML tree.
 
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -585,11 +588,13 @@ cdef class Node:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
+        
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
 
         for tag in tags:
             for element in self.css(tag):
-                element.unwrap()
+                element.unwrap(delete_empty)
 
     def replace_with(self, str_or_Node value):
         """Replace current Node with specified value.
@@ -752,7 +757,7 @@ cdef class Node:
         else:
             raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
 
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty=False):
         """Unwraps specified tags from the HTML tree.
 
         Works the same as th ``unwrap`` method, but applied to a list of tags.
@@ -769,11 +774,13 @@ cdef class Node:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
+        
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
 
         for tag in tags:
             for element in self.css(tag):
-                element.unwrap()
+                element.unwrap(delete_empty)
 
     @property
     def raw_value(self):
diff --git a/tests/test_nodes.py b/tests/test_nodes.py
@@ -273,13 +273,29 @@ def test_unwrap(parser):
     assert html_parser.body.child.html == '<a id="url" href="https://rushter.com/">I linked to rushter.com</a>'
 
 
+@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
+def test_unwrap_empty_tag(parser):
+    html = '<a id="url" href="https://rushter.com/">I linked to rushter.com<i></i></a>'
+    html_parser = parser(html)
+    node = html_parser.css_first('i')
+    node.unwrap(delete_empty=True)
+    assert html_parser.body.child.html == '<a id="url" href="https://rushter.com/">I linked to rushter.com</a>'
+
+
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
 def test_unwrap_tags(parser):
     html_parser = parser("<div><a href="">Hello</a> <i>world</i>!</div>")
     html_parser.body.unwrap_tags(['i', 'a'])
     assert html_parser.body.html == '<body><div>Hello world!</div></body>'
 
 
+@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
+def test_unwrap_empty_tags(parser):
+    html_parser = parser("<div><a href="">Hello</a> <i>world</i>!<i></i><a></a></div>")
+    html_parser.body.unwrap_tags(['i', 'a'], delete_empty=True)
+    assert html_parser.body.html == '<body><div>Hello world!</div></body>'
+    
+
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
 def test_unwraps_multiple_child_nodes(parser):
     html = """
@@ -291,6 +307,18 @@ def test_unwraps_multiple_child_nodes(parser):
     html_parser.body.unwrap_tags(['span', 'i'])
     assert html_parser.body.child.html == '<div id="test">\n        foo bar Lorems I dummy <div>text</div>\n    </div>'
 
+ 
+@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
+def test_unwraps_multiple_child_nodes_with_empty(parser):
+    html = """
+    <div id="test">
+        foo <span>bar <i>Lor<span>ems</span></i> I <span class='p3'>dummy<span><i></i></span> <div>text</div></span></span>
+    </div>
+    """
+    html_parser = parser(html)
+    html_parser.body.unwrap_tags(['span', 'i'], delete_empty=True)
+    assert html_parser.body.child.html == '<div id="test">\n        foo bar Lorems I dummy <div>text</div>\n    </div>'
+
 
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
 def test_replace_with(parser):