Add html_pretty method

rushter · rushter · commit a1b71ff7c746 · 2026-05-04T17:57:49.000+04:00
diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd
@@ -247,6 +247,10 @@ cdef extern from "lexbor/html/html.h" nogil:
 
     lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
     lxb_status_t lxb_html_serialize_deep_str(lxb_dom_node_t *node, lexbor_str_t *str)
+    lxb_status_t lxb_html_serialize_pretty_tree_str(lxb_dom_node_t *node,
+                                                    lxb_html_serialize_opt_t opt,
+                                                    size_t indent,
+                                                    lexbor_str_t *str)
     lxb_html_element_t* lxb_html_element_inner_html_set(lxb_html_element_t *element,
                                                         const lxb_char_t *html, size_t size)
 
@@ -259,6 +263,7 @@ cdef class LexborNode:
     @staticmethod
     cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
     cdef void set_as_fragment_root(self)
+    cdef str _serialize_html(self, lxb_html_serialize_opt_t options, size_t indent, bint pretty)
     cdef inline LexborNode _get_node(self)
 
 
diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx
@@ -392,6 +392,68 @@ cdef class LexborHTMLParser:
         node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
         return node.html
 
+    def html_pretty(
+        self,
+        Py_ssize_t indent=0,
+        bint skip_ws_nodes=False,
+        bint skip_comment=False,
+        bint raw=False,
+        bint without_closing=False,
+        bint tag_with_ns=False,
+        bint without_text_indent=False,
+        bint full_doctype=False,
+    ):
+        """Serialize the parsed page as indented, human-readable HTML.
+
+        Parameters
+        ----------
+        indent : int, optional
+            Starting indentation level for Lexbor; negative values raise ``ValueError``.
+        skip_ws_nodes : bool, optional
+            Drop text nodes made up solely of whitespace.
+        skip_comment : bool, optional
+            Leave HTML comment nodes out of the output.
+        raw : bool, optional
+            Emit text and attribute values without HTML escaping.
+        without_closing : bool, optional
+            Do not emit closing tags for non-void elements.
+        tag_with_ns : bool, optional
+            Prefix tag names with their namespace when one is available.
+        without_text_indent : bool, optional
+            Turn off the extra indentation around text and comment content.
+        full_doctype : bool, optional
+            Emit the complete doctype declaration when a doctype node exists.
+
+        Returns
+        -------
+        str or None
+            The pretty-printed markup. ``None`` if no document is loaded (checked before
+            ``indent`` is validated), the fragment has no root, or serialization fails.
+        """
+        cdef lxb_html_serialize_opt_t options
+        if self.document == NULL:
+            return None
+        if indent < 0:
+            raise ValueError("indent must be greater than or equal to 0")
+
+        if self._is_fragment:
+            # Fragment parsers serialize through their root node, not the document node.
+            root = self.root
+            if root is None:
+                return None
+            return root.html_pretty(
+                indent=indent, skip_ws_nodes=skip_ws_nodes, skip_comment=skip_comment, raw=raw,
+                without_closing=without_closing, tag_with_ns=tag_with_ns,
+                without_text_indent=without_text_indent, full_doctype=full_doctype,
+            )
+
+        options = _html_pretty_options(
+            skip_ws_nodes, skip_comment, raw, without_closing,
+            tag_with_ns, without_text_indent, full_doctype,
+        )
+        document_node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
+        return document_node._serialize_html(options, <size_t> indent, True)
+
     def css(self, str query):
         """A CSS selector.
 
diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi
@@ -123,6 +123,74 @@ cdef class LexborNode:
             return html
         return None
 
+    cdef inline str _serialize_html(self, lxb_html_serialize_opt_t options, size_t indent, bint pretty):
+        """Serialize this node to HTML, or return ``None`` if Lexbor fails or yields no data.
+
+        ``options`` and ``indent`` are only consulted when ``pretty`` is true.
+        """
+        cdef lxb_status_t status
+        cdef lexbor_str_t *lxb_str = lexbor_str_create()
+        try:
+            if self._is_fragment_root and pretty:
+                status = serialize_fragment_pretty(self.node, lxb_str, options, indent)
+            elif self._is_fragment_root:
+                status = serialize_fragment(self.node, lxb_str)
+            elif pretty:
+                status = lxb_html_serialize_pretty_tree_str(self.node, options, indent, lxb_str)
+            else:
+                status = lxb_html_serialize_tree_str(self.node, lxb_str)
+            if status == 0 and lxb_str.data:
+                return lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
+            return None
+        finally:  # also runs on failure/decode errors, which previously leaked the buffer
+            lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
+
+    def html_pretty(
+        self,
+        Py_ssize_t indent=0,
+        bint skip_ws_nodes=False,
+        bint skip_comment=False,
+        bint raw=False,
+        bint without_closing=False,
+        bint tag_with_ns=False,
+        bint without_text_indent=False,
+        bint full_doctype=False,
+    ):
+        """Serialize this node as indented, human-readable HTML.
+
+        Parameters
+        ----------
+        indent : int, optional
+            Starting indentation level for Lexbor; negative values raise ``ValueError``.
+        skip_ws_nodes : bool, optional
+            Drop text nodes made up solely of whitespace.
+        skip_comment : bool, optional
+            Leave HTML comment nodes out of the output.
+        raw : bool, optional
+            Emit text and attribute values without HTML escaping.
+        without_closing : bool, optional
+            Do not emit closing tags for non-void elements.
+        tag_with_ns : bool, optional
+            Prefix tag names with their namespace when one is available.
+        without_text_indent : bool, optional
+            Turn off the extra indentation around text and comment content.
+        full_doctype : bool, optional
+            Emit the complete doctype declaration when a doctype node exists.
+
+        Returns
+        -------
+        str or None
+            The pretty-printed markup, or ``None`` when serialization fails.
+        """
+        cdef lxb_html_serialize_opt_t flags
+        if indent < 0:
+            raise ValueError("indent must be greater than or equal to 0")
+        flags = _html_pretty_options(
+            skip_ws_nodes, skip_comment, raw, without_closing,
+            tag_with_ns, without_text_indent, full_doctype,
+        )
+        return self._serialize_html(flags, <size_t> indent, True)
+
     def __hash__(self):
         return self.mem_id
 
@@ -1129,6 +1197,51 @@ cdef lxb_status_t serialize_fragment(lxb_dom_node_t *node, lexbor_str_t *lxb_str
 
     return LXB_STATUS_OK
 
+
+cdef lxb_status_t serialize_fragment_pretty(
+    lxb_dom_node_t *node,
+    lexbor_str_t *lxb_str,
+    lxb_html_serialize_opt_t options,
+    size_t indent,
+):
+    """Pretty-serialize ``node`` and each following sibling into ``lxb_str``."""
+    cdef lxb_status_t status = LXB_STATUS_OK
+    cdef lxb_dom_node_t *current = node
+    # Walk the sibling chain, stopping at the first status that is not OK.
+    while current != NULL and status == LXB_STATUS_OK:
+        status = lxb_html_serialize_pretty_tree_str(current, options, indent, lxb_str)
+        current = current.next
+    return status
+
+
+cdef inline lxb_html_serialize_opt_t _html_pretty_options(
+    bint skip_ws_nodes,
+    bint skip_comment,
+    bint raw,
+    bint without_closing,
+    bint tag_with_ns,
+    bint without_text_indent,
+    bint full_doctype,
+):
+    """Combine the enabled keyword switches into a single Lexbor serialization bitmask."""
+    # Accumulate in a plain int and cast back to the option type once, at the end.
+    cdef int mask = LXB_HTML_SERIALIZE_OPT_UNDEF
+    if skip_ws_nodes:
+        mask |= LXB_HTML_SERIALIZE_OPT_SKIP_WS_NODES
+    if skip_comment:
+        mask |= LXB_HTML_SERIALIZE_OPT_SKIP_COMMENT
+    if raw:
+        mask |= LXB_HTML_SERIALIZE_OPT_RAW
+    if without_closing:
+        mask |= LXB_HTML_SERIALIZE_OPT_WITHOUT_CLOSING
+    if tag_with_ns:
+        mask |= LXB_HTML_SERIALIZE_OPT_TAG_WITH_NS
+    if without_text_indent:
+        mask |= LXB_HTML_SERIALIZE_OPT_WITHOUT_TEXT_INDENT
+    if full_doctype:
+        mask |= LXB_HTML_SERIALIZE_OPT_FULL_DOCTYPE
+    return <lxb_html_serialize_opt_t> mask
+
 cdef inline bint _is_node_type(lxb_dom_node_t *node, lxb_dom_node_type_t expected_type):
     return node != NULL and node.type == expected_type
 
diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py
@@ -2,8 +2,10 @@
 
 from inspect import cleandoc
 
+import pytest
 
-from selectolax.lexbor import LexborHTMLParser, parse_fragment, SelectolaxError
+
+from selectolax.lexbor import LexborHTMLParser, SelectolaxError, parse_fragment
 
 
 def clean_doc(text: str) -> str:
@@ -27,6 +29,58 @@ def test_sets_inner_html():
     assert actual == expected
 
 
+def test_html_pretty_document():
+    parser = LexborHTMLParser("<div><span>Hello</span><!-- note --></div>")
+    assert parser.html_pretty() == clean_doc(
+        """
+        <html>
+          <head>
+          </head>
+          <body>
+            <div>
+              <span>
+                "Hello"
+              </span>
+              <!--  note  -->
+            </div>
+          </body>
+        </html>
+        """
+    )
+
+
+def test_html_pretty_node_with_options():
+    parser = LexborHTMLParser("<div><span>Hello</span><!-- note --></div>")
+    node = parser.css_first("div")
+    assert node.html_pretty(skip_comment=True) == clean_doc(
+        """
+        <div>
+          <span>
+            "Hello"
+          </span>
+        </div>
+        """
+    )
+
+
+def test_html_pretty_skip_ws_nodes_option():
+    parser = LexborHTMLParser("<div>\n</div><span></span>", is_fragment=True)
+    assert parser.html_pretty(skip_ws_nodes=True) == clean_doc(
+        """
+        <div>
+        </div>
+        <span>
+        </span>
+        """
+    )
+
+
+def test_html_pretty_rejects_negative_indent():
+    parser = LexborHTMLParser("<div>Hello</div>")
+    with pytest.raises(ValueError):
+        parser.html_pretty(indent=-1)
+
+
 def test_checking_attributes_does_not_segfault():
     parser = LexborHTMLParser("")
     root_node = parser.root
diff --git a/tests/test_lexbor_fragment.py b/tests/test_lexbor_fragment.py
@@ -149,6 +149,31 @@ def test_fragment_root_html_serialization():
     assert p.html == "<div>Hello!</div><span>World</span>"
 
 
+def test_fragment_root_html_pretty_serialization():
+    html = "<div><span>Hello</span></div>\n<span>World</span>"
+    p = LexborHTMLParser(html, is_fragment=True)
+    assert p.root.html_pretty(skip_ws_nodes=True) == clean_doc(
+        """
+        <div>
+          <span>
+          </span>
+        </div>
+        <span>
+        </span>
+        """
+    )
+    assert p.html_pretty(skip_ws_nodes=True) == clean_doc(
+        """
+        <div>
+          <span>
+          </span>
+        </div>
+        <span>
+        </span>
+        """
+    )
+
+
 def test_fragment_node_properties():
     html = "<div>Hello</div><span>World</span>"
     p = LexborHTMLParser(html, is_fragment=True)