Skip to content

Commit 112d30d

Browse files
committed
Stability improvements
1 parent f29d60d commit 112d30d

6 files changed

Lines changed: 621 additions & 26 deletions

File tree

selectolax/lexbor.pxd

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -246,6 +246,7 @@ cdef class LexborNode:
246246
@staticmethod
247247
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
248248
cdef void set_as_fragment_root(self)
249+
cdef inline LexborNode _get_node(self)
249250

250251

251252
cdef bint is_empty_text_node(lxb_dom_node_t *node)
@@ -266,7 +267,7 @@ cdef class LexborCSSSelector:
266267

267268
cdef class LexborHTMLParser:
268269
cdef lxb_html_document_t *document
269-
cdef lxb_html_document_t *_original_document
270+
cdef lxb_html_document_t *_fragment_document
270271
cdef bint _is_fragment
271272
cdef public bytes raw_html
272273
cdef LexborCSSSelector _selector
@@ -279,7 +280,7 @@ cdef class LexborHTMLParser:
279280

280281
@staticmethod
281282
cdef LexborHTMLParser from_document(lxb_html_document_t * document, bytes raw_html)
282-
cdef inline lxb_html_document_t* main_document(self)
283+
cdef inline lxb_html_document_t* main_document(self) nogil
283284

284285
cdef extern from "lexbor/dom/dom.h" nogil:
285286
ctypedef enum lexbor_action_t:

selectolax/lexbor.pyx

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -48,16 +48,16 @@ cdef class LexborHTMLParser:
4848
cdef size_t html_len
4949
cdef object bytes_html
5050
self._is_fragment = is_fragment
51-
self._original_document = NULL
51+
self._fragment_document = NULL
5252
self._selector = None
5353
self._new_html_document()
5454
bytes_html, html_len = preprocess_input(html)
5555
self._parse_html(bytes_html, html_len)
5656
self.raw_html = bytes_html
5757

58-
cdef inline lxb_html_document_t* main_document(self):
58+
cdef inline lxb_html_document_t* main_document(self) nogil:
5959
if self._is_fragment:
60-
return self._original_document
60+
return self._fragment_document
6161
else:
6262
return self.document
6363

@@ -180,9 +180,7 @@ cdef class LexborHTMLParser:
180180
if fragment_html_node == NULL:
181181
return LXB_STATUS_ERROR
182182

183-
# Use the fragment document returned by lexbor as the parser document.
184-
self._original_document = self.document
185-
self.document = <lxb_html_document_t *> fragment_html_node
183+
self._fragment_document = <lxb_html_document_t *> fragment_html_node
186184
return LXB_STATUS_OK
187185

188186
def __dealloc__(self):
@@ -197,10 +195,10 @@ cdef class LexborHTMLParser:
197195
Safe to call multiple times; does nothing if the document is already
198196
freed.
199197
"""
198+
if self._fragment_document != NULL:
199+
lxb_html_document_destroy(self._fragment_document)
200200
if self.document != NULL:
201201
lxb_html_document_destroy(self.document)
202-
if self._original_document != NULL:
203-
lxb_html_document_destroy(self._original_document)
204202

205203
def __repr__(self):
206204
"""Return a concise representation of the parsed document.
@@ -238,7 +236,14 @@ cdef class LexborHTMLParser:
238236
if self.document == NULL:
239237
return None
240238
cdef LexborNode node
241-
node = LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
239+
cdef lxb_dom_node_t* dom_root
240+
if self._is_fragment and self._fragment_document != NULL:
241+
dom_root = lxb_dom_document_root(&self._fragment_document.dom_document)
242+
else:
243+
dom_root = lxb_dom_document_root(&self.document.dom_document)
244+
if dom_root == NULL:
245+
return None
246+
node = LexborNode.new(dom_root, self)
242247
if self._is_fragment:
243248
node.set_as_fragment_root()
244249
return node
@@ -370,6 +375,8 @@ cdef class LexborHTMLParser:
370375
if self.document == NULL:
371376
return None
372377
if self._is_fragment:
378+
if self.root is None:
379+
return None
373380
return self.root.html
374381
node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
375382
return node.html
@@ -626,7 +633,7 @@ cdef class LexborHTMLParser:
626633
with nogil:
627634
cloned_node = lxb_dom_document_import_node(
628635
&cloned_document.dom_document,
629-
<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document),
636+
<lxb_dom_node_t *> lxb_dom_document_root(&self.main_document().dom_document),
630637
<bint> True
631638
)
632639

selectolax/lexbor/node.pxi

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,7 @@ cdef class LexborNode:
114114
lxb_str = lexbor_str_create()
115115
if self._is_fragment_root:
116116
status = serialize_fragment(self.node, lxb_str)
117+
# status = lxb_html_serialize_tree_str(self.node, lxb_str)
117118
else:
118119
status = lxb_html_serialize_tree_str(self.node, lxb_str)
119120
if status == 0 and lxb_str.data:
@@ -202,6 +203,14 @@ cdef class LexborNode:
202203
)
203204
return container.text
204205

206+
cdef inline LexborNode _get_node(self):
207+
cdef LexborNode node
208+
if self._is_fragment_root:
209+
node = self.parent
210+
else:
211+
node = self
212+
return node
213+
205214
def css(self, str query):
206215
"""Evaluate CSS selector against current node and its child nodes.
207216
@@ -223,7 +232,7 @@ cdef class LexborNode:
223232
-------
224233
selector : list of `Node` objects
225234
"""
226-
return self.parser.selector.find(query, self)
235+
return self.parser.selector.find(query, self._get_node())
227236

228237
def css_first(self, str query, default=None, bool strict=False):
229238
"""Same as `css` but returns only the first match.
@@ -245,9 +254,9 @@ cdef class LexborNode:
245254
selector : `LexborNode` object
246255
"""
247256
if strict:
248-
results = self.parser.selector.find(query, self)
257+
results = self.parser.selector.find(query, self._get_node())
249258
else:
250-
results = self.parser.selector.find_first(query, self)
259+
results = self.parser.selector.find_first(query, self._get_node())
251260
n_results = len(results)
252261
if n_results > 0:
253262
if strict and n_results > 1:
@@ -664,7 +673,7 @@ cdef class LexborNode:
664673
if isinstance(value, (str, bytes, unicode)):
665674
bytes_val = to_bytes(value)
666675
new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
667-
&self.parser.main_document().dom_document,
676+
&self.parser.document.dom_document,
668677
<lxb_char_t *> bytes_val, len(bytes_val)
669678
)
670679
if new_node == NULL:
@@ -673,7 +682,7 @@ cdef class LexborNode:
673682
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
674683
elif isinstance(value, LexborNode):
675684
new_node = lxb_dom_document_import_node(
676-
&self.parser.main_document().dom_document,
685+
&self.parser.document.dom_document,
677686
<lxb_dom_node_t *> value.node,
678687
<bint> True
679688
)
@@ -717,15 +726,15 @@ cdef class LexborNode:
717726
if isinstance(value, (str, bytes, unicode)):
718727
bytes_val = to_bytes(value)
719728
new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
720-
&self.parser.main_document().dom_document,
729+
&self.parser.document.dom_document,
721730
<lxb_char_t *> bytes_val, len(bytes_val)
722731
)
723732
if new_node == NULL:
724733
raise SelectolaxError("Can't create a new node")
725734
lxb_dom_node_insert_before(self.node, new_node)
726735
elif isinstance(value, LexborNode):
727736
new_node = lxb_dom_document_import_node(
728-
&self.parser.main_document().dom_document,
737+
&self.parser.document.dom_document,
729738
<lxb_dom_node_t *> value.node,
730739
<bint> True
731740
)
@@ -768,15 +777,15 @@ cdef class LexborNode:
768777
if isinstance(value, (str, bytes, unicode)):
769778
bytes_val = to_bytes(value)
770779
new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
771-
&self.parser.main_document().dom_document,
780+
&self.parser.document.dom_document,
772781
<lxb_char_t *> bytes_val, len(bytes_val)
773782
)
774783
if new_node == NULL:
775784
raise SelectolaxError("Can't create a new node")
776785
lxb_dom_node_insert_after(self.node, new_node)
777786
elif isinstance(value, LexborNode):
778787
new_node = lxb_dom_document_import_node(
779-
&self.parser.main_document().dom_document,
788+
&self.parser.document.dom_document,
780789
<lxb_dom_node_t *> value.node,
781790
<bint> True
782791
)
@@ -819,15 +828,15 @@ cdef class LexborNode:
819828
if isinstance(value, (str, bytes, unicode)):
820829
bytes_val = to_bytes(value)
821830
new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
822-
&self.parser.main_document().dom_document,
831+
&self.parser.document.dom_document,
823832
<lxb_char_t *> bytes_val, len(bytes_val)
824833
)
825834
if new_node == NULL:
826835
raise SelectolaxError("Can't create a new node")
827836
lxb_dom_node_insert_child(self.node, new_node)
828837
elif isinstance(value, LexborNode):
829838
new_node = lxb_dom_document_import_node(
830-
&self.parser.main_document().dom_document,
839+
&self.parser.document.dom_document,
831840
<lxb_dom_node_t *> value.node,
832841
<bint> True
833842
)
@@ -930,7 +939,7 @@ cdef class LexborNode:
930939
-------
931940
selector : The `Selector` class.
932941
"""
933-
return LexborSelector(self, query)
942+
return LexborSelector(self._get_node(), query)
934943

935944
def __eq__(self, other):
936945
if isinstance(other, str):

selectolax/lexbor/selection.pxi

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -165,6 +165,8 @@ cdef class LexborSelector:
165165
cdef list nodes = []
166166
for node in self.nodes:
167167
attr = node.attributes.get(attribute)
168+
if not attr:
169+
continue
168170
if attr and start and start in attr:
169171
attr = attr[attr.find(start) + len(start):]
170172
if len(attr) > length:

0 commit comments

Comments
 (0)