Skip to content

Commit 31a9cb3

Browse files
authored
Add "unwrap empty tag" functionality (#161)
* Update node.pxi fix unwrapping in 'lexbor' wrapper * Update node.pxi change modest unwrap * Update node.pxi fix spelling * Update test_nodes.py Testing new unwrap mechanics
1 parent c303b5c commit 31a9cb3

3 files changed

Lines changed: 49 additions & 9 deletions

File tree

selectolax/lexbor/node.pxi

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ cdef class LexborNode:
416416
node = node.next
417417

418418

419-
def unwrap(self):
419+
def unwrap(self, delete_empty=False):
420420
"""Replace node with whatever is inside this node.
421421
422422
Examples
@@ -426,9 +426,12 @@ cdef class LexborNode:
426426
>>> tree.css_first('i').unwrap()
427427
>>> tree.html
428428
'<html><head></head><body><div>Hello world!</div></body></html>'
429-
429+
430+
Note: by default, empty tags are ignored, use "delete_empty" to change this.
430431
"""
431432
if self.node.first_child == NULL:
433+
if delete_empty:
434+
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
432435
return
433436
cdef lxb_dom_node_t* next_node;
434437
cdef lxb_dom_node_t* current_node;
@@ -445,7 +448,7 @@ cdef class LexborNode:
445448
lxb_dom_node_insert_before(self.node, self.node.first_child)
446449
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
447450

448-
def unwrap_tags(self, list tags):
451+
def unwrap_tags(self, list tags, delete_empty=False):
449452
"""Unwraps specified tags from the HTML tree.
450453
451454
Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -462,11 +465,13 @@ cdef class LexborNode:
462465
>>> tree.body.unwrap_tags(['i','a'])
463466
>>> tree.body.html
464467
'<body><div>Hello world!</div></body>'
468+
469+
Note: by default, empty tags are ignored, use "delete_empty" to change this.
465470
"""
466471

467472
for tag in tags:
468473
for element in self.css(tag):
469-
element.unwrap()
474+
element.unwrap(delete_empty)
470475

471476

472477
def traverse(self, include_text=False):

selectolax/modest/node.pxi

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,7 @@ cdef class Node:
515515
"""An alias for the decompose method."""
516516
self.decompose(recursive)
517517

518-
def unwrap(self):
518+
def unwrap(self, delete_empty=False):
519519
"""Replace node with whatever is inside this node.
520520
521521
Examples
@@ -526,8 +526,11 @@ cdef class Node:
526526
>>> tree.html
527527
'<html><head></head><body><div>Hello world!</div></body></html>'
528528
529+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
529530
"""
530531
if self.node.child == NULL:
532+
if delete_empty:
533+
myhtml_node_delete(self.node)
531534
return
532535
cdef myhtml_tree_node_t* next_node;
533536
cdef myhtml_tree_node_t* current_node;
@@ -568,7 +571,7 @@ cdef class Node:
568571
for element in self.css(tag):
569572
element.decompose(recursive=recursive)
570573

571-
def unwrap_tags(self, list tags):
574+
def unwrap_tags(self, list tags, delete_empty=False):
572575
"""Unwraps specified tags from the HTML tree.
573576
574577
Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -585,11 +588,13 @@ cdef class Node:
585588
>>> tree.body.unwrap_tags(['i','a'])
586589
>>> tree.body.html
587590
'<body><div>Hello world!</div></body>'
591+
592+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
588593
"""
589594

590595
for tag in tags:
591596
for element in self.css(tag):
592-
element.unwrap()
597+
element.unwrap(delete_empty)
593598

594599
def replace_with(self, str_or_Node value):
595600
"""Replace current Node with specified value.
@@ -752,7 +757,7 @@ cdef class Node:
752757
else:
753758
raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
754759

755-
def unwrap_tags(self, list tags):
760+
def unwrap_tags(self, list_tags, delete_empty=False):
756761
"""Unwraps specified tags from the HTML tree.
757762
758763
Works the same as th ``unwrap`` method, but applied to a list of tags.
@@ -769,11 +774,13 @@ cdef class Node:
769774
>>> tree.body.unwrap_tags(['i','a'])
770775
>>> tree.body.html
771776
'<body><div>Hello world!</div></body>'
777+
778+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
772779
"""
773780

774781
for tag in tags:
775782
for element in self.css(tag):
776-
element.unwrap()
783+
element.unwrap(delete_empty)
777784

778785
@property
779786
def raw_value(self):

tests/test_nodes.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,13 +273,29 @@ def test_unwrap(parser):
273273
assert html_parser.body.child.html == '<a id="url" href="https://rushter.com/">I linked to rushter.com</a>'
274274

275275

276+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
277+
def test_unwrap_empty_tag(parser):
278+
html = '<a id="url" href="https://rushter.com/">I linked to rushter.com<i></i></a>'
279+
html_parser = parser(html)
280+
node = html_parser.css_first('i')
281+
node.unwrap(delete_empty=True)
282+
assert html_parser.body.child.html == '<a id="url" href="https://rushter.com/">I linked to rushter.com</a>'
283+
284+
276285
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
277286
def test_unwrap_tags(parser):
278287
html_parser = parser("<div><a href="">Hello</a> <i>world</i>!</div>")
279288
html_parser.body.unwrap_tags(['i', 'a'])
280289
assert html_parser.body.html == '<body><div>Hello world!</div></body>'
281290

282291

292+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
293+
def test_unwrap_empty_tags(parser):
294+
html_parser = parser("<div><a href="">Hello</a> <i>world</i>!<i></i><a></a></div>")
295+
html_parser.body.unwrap_tags(['i', 'a'])
296+
assert html_parser.body.html == '<body><div>Hello world!</div></body>'
297+
298+
283299
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
284300
def test_unwraps_multiple_child_nodes(parser):
285301
html = """
@@ -291,6 +307,18 @@ def test_unwraps_multiple_child_nodes(parser):
291307
html_parser.body.unwrap_tags(['span', 'i'])
292308
assert html_parser.body.child.html == '<div id="test">\n foo bar Lorems I dummy <div>text</div>\n </div>'
293309

310+
311+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
312+
def test_unwraps_multiple_child_nodes_with_empty(parser):
313+
html = """
314+
<div id="test">
315+
foo <span>bar <i>Lor<span>ems</span></i> I <span class='p3'>dummy<span><i></i></span> <div>text</div></span></span>
316+
</div>
317+
"""
318+
html_parser = parser(html)
319+
html_parser.body.unwrap_tags(['span', 'i'], delete_empty=True)
320+
assert html_parser.body.child.html == '<div id="test">\n foo bar Lorems I dummy <div>text</div>\n </div>'
321+
294322

295323
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
296324
def test_replace_with(parser):

0 commit comments

Comments
 (0)