Skip to content

Commit a1b71ff

Browse files
committed
Add html_pretty method
1 parent e7c6674 commit a1b71ff

5 files changed

Lines changed: 260 additions & 1 deletion

File tree

selectolax/lexbor.pxd

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,10 @@ cdef extern from "lexbor/html/html.h" nogil:
247247

248248
lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
249249
lxb_status_t lxb_html_serialize_deep_str(lxb_dom_node_t *node, lexbor_str_t *str)
250+
lxb_status_t lxb_html_serialize_pretty_tree_str(lxb_dom_node_t *node,
251+
lxb_html_serialize_opt_t opt,
252+
size_t indent,
253+
lexbor_str_t *str)
250254
lxb_html_element_t* lxb_html_element_inner_html_set(lxb_html_element_t *element,
251255
const lxb_char_t *html, size_t size)
252256

@@ -259,6 +263,7 @@ cdef class LexborNode:
259263
@staticmethod
260264
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
261265
cdef void set_as_fragment_root(self)
266+
cdef str _serialize_html(self, lxb_html_serialize_opt_t options, size_t indent, bint pretty)
262267
cdef inline LexborNode _get_node(self)
263268

264269

selectolax/lexbor.pyx

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,68 @@ cdef class LexborHTMLParser:
392392
node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
393393
return node.html
394394

395+
def html_pretty(
    self,
    Py_ssize_t indent=0,
    bint skip_ws_nodes=False,
    bint skip_comment=False,
    bint raw=False,
    bint without_closing=False,
    bint tag_with_ns=False,
    bint without_text_indent=False,
    bint full_doctype=False,
):
    """Serialize the parsed page as pretty-printed HTML.

    Fragment parsers delegate to ``html_pretty`` of their root node;
    otherwise serialization starts from the document node itself.

    Parameters
    ----------
    indent : int, optional
        Starting indentation level handed to Lexbor. Defaults to ``0``.
    skip_ws_nodes : bool, optional
        Drop text nodes consisting solely of whitespace.
    skip_comment : bool, optional
        Leave HTML comment nodes out of the output.
    raw : bool, optional
        Emit text and attribute values without HTML escaping.
    without_closing : bool, optional
        Do not emit closing tags for non-void elements.
    tag_with_ns : bool, optional
        Emit namespace prefixes in tag names when available.
    without_text_indent : bool, optional
        Turn off the extra indentation around text and comment content.
    full_doctype : bool, optional
        Emit the full document type declaration when a doctype node exists.

    Returns
    -------
    str or None
        The serialized markup. ``None`` when there is no underlying
        document, when a fragment parser has no root node, or when the
        serializer produces no result.

    Raises
    ------
    ValueError
        If ``indent`` is negative.
    """
    cdef lxb_html_serialize_opt_t options
    cdef LexborNode document_node

    if self.document == NULL:
        return None
    if indent < 0:
        raise ValueError("indent must be greater than or equal to 0")

    options = _html_pretty_options(
        skip_ws_nodes,
        skip_comment,
        raw,
        without_closing,
        tag_with_ns,
        without_text_indent,
        full_doctype,
    )

    if not self._is_fragment:
        # Whole-document case: start from the document node.
        document_node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
        return document_node._serialize_html(options, <size_t> indent, True)

    # Fragment case: the root node knows how to walk its siblings.
    if self.root is None:
        return None
    return self.root.html_pretty(
        indent=indent,
        skip_ws_nodes=skip_ws_nodes,
        skip_comment=skip_comment,
        raw=raw,
        without_closing=without_closing,
        tag_with_ns=tag_with_ns,
        without_text_indent=without_text_indent,
        full_doctype=full_doctype,
    )
456+
395457
def css(self, str query):
396458
"""A CSS selector.
397459

selectolax/lexbor/node.pxi

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,74 @@ cdef class LexborNode:
123123
return html
124124
return None
125125

126+
cdef inline str _serialize_html(self, lxb_html_serialize_opt_t options, size_t indent, bint pretty):
    """Serialize this node to an HTML string.

    Parameters
    ----------
    options : lxb_html_serialize_opt_t
        Serialization option flags; only used when ``pretty`` is true.
    indent : size_t
        Initial indentation level; only used when ``pretty`` is true.
    pretty : bint
        Select the pretty-printing serializer instead of the plain one.

    Returns
    -------
    str or None
        The serialized markup with any ``<-undef>`` markers removed, or
        ``None`` when the serializer reports a non-OK status or leaves the
        buffer empty.

    Raises
    ------
    MemoryError
        If the Lexbor string buffer cannot be allocated.
    """
    cdef lexbor_str_t *lxb_str
    cdef lxb_status_t status

    lxb_str = lexbor_str_create()
    if lxb_str == NULL:
        # The serializers write into the buffer, so a failed allocation
        # must be reported before it is handed to them.
        raise MemoryError("Failed to allocate a Lexbor string buffer.")

    try:
        if self._is_fragment_root:
            # A fragment root stands for a run of sibling nodes.
            if pretty:
                status = serialize_fragment_pretty(self.node, lxb_str, options, indent)
            else:
                status = serialize_fragment(self.node, lxb_str)
        else:
            if pretty:
                status = lxb_html_serialize_pretty_tree_str(self.node, options, indent, lxb_str)
            else:
                status = lxb_html_serialize_tree_str(self.node, lxb_str)

        if status != LXB_STATUS_OK or lxb_str.data == NULL:
            return None
        return lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
    finally:
        # Release the buffer on every path (success, serializer failure,
        # decode error) so it is never leaked.
        lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
147+
148+
def html_pretty(
    self,
    Py_ssize_t indent=0,
    bint skip_ws_nodes=False,
    bint skip_comment=False,
    bint raw=False,
    bint without_closing=False,
    bint tag_with_ns=False,
    bint without_text_indent=False,
    bint full_doctype=False,
):
    """Serialize this node as pretty-printed HTML.

    Parameters
    ----------
    indent : int, optional
        Starting indentation level handed to Lexbor. Defaults to ``0``.
    skip_ws_nodes : bool, optional
        Drop text nodes consisting solely of whitespace.
    skip_comment : bool, optional
        Leave HTML comment nodes out of the output.
    raw : bool, optional
        Emit text and attribute values without HTML escaping.
    without_closing : bool, optional
        Do not emit closing tags for non-void elements.
    tag_with_ns : bool, optional
        Emit namespace prefixes in tag names when available.
    without_text_indent : bool, optional
        Turn off the extra indentation around text and comment content.
    full_doctype : bool, optional
        Emit the full document type declaration when a doctype node exists.

    Returns
    -------
    str or None
        Whatever ``_serialize_html`` yields for this node in pretty mode.

    Raises
    ------
    ValueError
        If ``indent`` is negative.
    """
    # Reject negative values before the cast to an unsigned size_t below.
    if indent < 0:
        raise ValueError("indent must be greater than or equal to 0")

    return self._serialize_html(
        _html_pretty_options(
            skip_ws_nodes,
            skip_comment,
            raw,
            without_closing,
            tag_with_ns,
            without_text_indent,
            full_doctype,
        ),
        <size_t> indent,
        True,
    )
193+
126194
# Hash the node by its ``mem_id`` attribute.
def __hash__(self):
127195
return self.mem_id
128196

@@ -1129,6 +1197,51 @@ cdef lxb_status_t serialize_fragment(lxb_dom_node_t *node, lexbor_str_t *lxb_str
11291197

11301198
return LXB_STATUS_OK
11311199

1200+
1201+
cdef lxb_status_t serialize_fragment_pretty(
    lxb_dom_node_t *node,
    lexbor_str_t *lxb_str,
    lxb_html_serialize_opt_t options,
    size_t indent,
):
    """Pretty-serialize ``node`` and each following sibling into ``lxb_str``.

    Walks the ``next`` chain starting at ``node`` and stops at the first
    serializer call that does not return ``LXB_STATUS_OK``, returning that
    status. Returns ``LXB_STATUS_OK`` when every node (or none) was written.
    """
    cdef lxb_status_t result = LXB_STATUS_OK
    cdef lxb_dom_node_t *current = node

    while current != NULL:
        result = lxb_html_serialize_pretty_tree_str(current, options, indent, lxb_str)
        if result != LXB_STATUS_OK:
            break
        current = current.next

    return result
1215+
1216+
1217+
cdef inline lxb_html_serialize_opt_t _html_pretty_options(
    bint skip_ws_nodes,
    bint skip_comment,
    bint raw,
    bint without_closing,
    bint tag_with_ns,
    bint without_text_indent,
    bint full_doctype,
):
    """Fold the boolean switches into one Lexbor serialization bitmask.

    Starts from ``LXB_HTML_SERIALIZE_OPT_UNDEF`` and ORs in the matching
    ``LXB_HTML_SERIALIZE_OPT_*`` constant for every switch that is set.
    """
    # Accumulate in a plain int so the cast happens once, at the end.
    cdef int flags = LXB_HTML_SERIALIZE_OPT_UNDEF

    if skip_ws_nodes:
        flags |= LXB_HTML_SERIALIZE_OPT_SKIP_WS_NODES
    if skip_comment:
        flags |= LXB_HTML_SERIALIZE_OPT_SKIP_COMMENT
    if raw:
        flags |= LXB_HTML_SERIALIZE_OPT_RAW
    if without_closing:
        flags |= LXB_HTML_SERIALIZE_OPT_WITHOUT_CLOSING
    if tag_with_ns:
        flags |= LXB_HTML_SERIALIZE_OPT_TAG_WITH_NS
    if without_text_indent:
        flags |= LXB_HTML_SERIALIZE_OPT_WITHOUT_TEXT_INDENT
    if full_doctype:
        flags |= LXB_HTML_SERIALIZE_OPT_FULL_DOCTYPE

    return <lxb_html_serialize_opt_t> flags
1244+
11321245
cdef inline bint _is_node_type(lxb_dom_node_t *node, lxb_dom_node_type_t expected_type):
    """Tell whether ``node`` is non-NULL and has type ``expected_type``."""
    if node == NULL:
        return False
    return node.type == expected_type
11341247

tests/test_lexbor.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22

33
from inspect import cleandoc
44

5+
import pytest
56

6-
from selectolax.lexbor import LexborHTMLParser, parse_fragment, SelectolaxError
7+
8+
from selectolax.lexbor import LexborHTMLParser, SelectolaxError, parse_fragment
79

810

911
def clean_doc(text: str) -> str:
@@ -27,6 +29,58 @@ def test_sets_inner_html():
2729
assert actual == expected
2830

2931

32+
# Compares html_pretty() of a full-document parser against an expected
# multi-line block that includes the html/head/body wrappers, the quoted
# "Hello" text and the comment node.
# NOTE(review): the indentation inside the expected block is not visible in
# this view; confirm it against the real file before editing the literal.
def test_html_pretty_document():
33+
parser = LexborHTMLParser("<div><span>Hello</span><!-- note --></div>")
34+
assert parser.html_pretty() == clean_doc(
35+
"""
36+
<html>
37+
<head>
38+
</head>
39+
<body>
40+
<div>
41+
<span>
42+
"Hello"
43+
</span>
44+
<!-- note -->
45+
</div>
46+
</body>
47+
</html>
48+
"""
49+
)
50+
51+
52+
# Compares html_pretty(skip_comment=True) of the first <div> node against an
# expected block that contains the span and its quoted text but no comment.
# NOTE(review): the indentation inside the expected block is not visible in
# this view; confirm it against the real file before editing the literal.
def test_html_pretty_node_with_options():
53+
parser = LexborHTMLParser("<div><span>Hello</span><!-- note --></div>")
54+
node = parser.css_first("div")
55+
assert node.html_pretty(skip_comment=True) == clean_doc(
56+
"""
57+
<div>
58+
<span>
59+
"Hello"
60+
</span>
61+
</div>
62+
"""
63+
)
64+
65+
66+
# Parses a fragment whose <div> contains only a newline and compares
# html_pretty(skip_ws_nodes=True) against an expected block with empty
# <div> and <span> elements.
# NOTE(review): the indentation inside the expected block is not visible in
# this view; confirm it against the real file before editing the literal.
def test_html_pretty_skip_ws_nodes_option():
67+
parser = LexborHTMLParser("<div>\n</div><span></span>", is_fragment=True)
68+
assert parser.html_pretty(skip_ws_nodes=True) == clean_doc(
69+
"""
70+
<div>
71+
</div>
72+
<span>
73+
</span>
74+
"""
75+
)
76+
77+
78+
def test_html_pretty_rejects_negative_indent():
    """A negative ``indent`` passed to the parser must raise ``ValueError``."""
    document = LexborHTMLParser("<div>Hello</div>")
    with pytest.raises(ValueError):
        document.html_pretty(indent=-1)
82+
83+
3084
def test_checking_attributes_does_not_segfault():
3185
parser = LexborHTMLParser("")
3286
root_node = parser.root

tests/test_lexbor_fragment.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,31 @@ def test_fragment_root_html_serialization():
149149
assert p.html == "<div>Hello!</div><span>World</span>"
150150

151151

152+
# Parses a two-element fragment and compares both p.root.html_pretty(...) and
# p.html_pretty(...) (each with skip_ws_nodes=True) against expected blocks.
# NOTE(review): as shown here the expected blocks contain no "Hello"/"World"
# text lines and no visible indentation; confirm both against the real file
# before editing the literals.
def test_fragment_root_html_pretty_serialization():
153+
html = "<div><span>Hello</span></div>\n<span>World</span>"
154+
p = LexborHTMLParser(html, is_fragment=True)
155+
assert p.root.html_pretty(skip_ws_nodes=True) == clean_doc(
156+
"""
157+
<div>
158+
<span>
159+
</span>
160+
</div>
161+
<span>
162+
</span>
163+
"""
164+
)
165+
assert p.html_pretty(skip_ws_nodes=True) == clean_doc(
166+
"""
167+
<div>
168+
<span>
169+
</span>
170+
</div>
171+
<span>
172+
</span>
173+
"""
174+
)
175+
176+
152177
def test_fragment_node_properties():
153178
html = "<div>Hello</div><span>World</span>"
154179
p = LexborHTMLParser(html, is_fragment=True)

0 commit comments

Comments
 (0)