Skip to content

Commit 65d5096

Browse files
committed
Fix .text() and .iter() for HTML fragments
Applies when the fragment has multiple nodes at the root level.
1 parent 912e92f commit 65d5096

3 files changed

Lines changed: 19 additions & 11 deletions

File tree

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# selectolax Changelog
22

3+
- Fix `.text()` and `.iter()` for HTML fragments when there are multiple nodes at the root level.
4+
35
# Version 0.4.6
46

57

selectolax/lexbor/node.pxi

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ cdef class LexborNode:
168168
169169
"""
170170
cdef unsigned char * text
171-
cdef lxb_dom_node_t * node = <lxb_dom_node_t *> self.node.first_child
171+
cdef LexborNode start_node = self._get_node()
172+
cdef lxb_dom_node_t * node = <lxb_dom_node_t *> start_node.node.first_child
172173

173174
if not deep:
174175
container = TextContainer(separator, strip)
@@ -197,7 +198,7 @@ cdef class LexborNode:
197198
container.append(text.decode(_ENCODING))
198199

199200
lxb_dom_node_simple_walk(
200-
<lxb_dom_node_t *> self.node,
201+
<lxb_dom_node_t *> start_node.node,
201202
<lxb_dom_node_simple_walker_f> text_callback,
202203
<void *> container
203204
)
@@ -468,7 +469,8 @@ cdef class LexborNode:
468469
to the provided options.
469470
"""
470471

471-
cdef lxb_dom_node_t *node = self.node.first_child
472+
cdef LexborNode start_node = self._get_node()
473+
cdef lxb_dom_node_t *node = start_node.node.first_child
472474
cdef LexborNode next_node
473475

474476
while node != NULL:

tests/test_lexbor_fragment.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from inspect import cleandoc
22
import pytest
3-
from selectolax.lexbor import LexborHTMLParser, SelectolaxError
3+
from selectolax.lexbor import LexborHTMLParser
44

55

66
def clean_doc(text: str) -> str:
@@ -491,10 +491,14 @@ def test_fragment_create_node_with_attributes():
491491
assert 'class="link"' in html
492492

493493

494-
def test_fragment_create_node_empty_tag_name():
495-
parser = LexborHTMLParser("<div></div>", is_fragment=True)
496-
try:
497-
parser.create_node("")
498-
assert False, "Should have raised an exception"
499-
except SelectolaxError:
500-
pass
494+
def test_fragment_text_extraction_multiple_nodes():
495+
html = "<p>1</p><p>2</p>"
496+
p = LexborHTMLParser(html, is_fragment=True)
497+
assert p.text(deep=False) == ""
498+
assert p.text(deep=True, separator=" ", strip=True) == "1 2"
499+
500+
501+
def test_fragment_iter_multiple_nodes():
502+
html = "<p>1</p><p>2</p>"
503+
p = LexborHTMLParser(html, is_fragment=True)
504+
assert len(list(p.root.iter())) == 2

0 commit comments

Comments
 (0)